Fix regression of model loading performance when using mlock. #2204

Closed
wants to merge 4 commits

1 change: 1 addition & 0 deletions examples/common.cpp
@@ -572,6 +572,7 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param
lparams.use_mlock = params.use_mlock;
lparams.logits_all = params.perplexity;
lparams.embedding = params.embedding;
+ lparams.has_lora = !params.lora_adapter.empty();

return lparams;
}

6 changes: 3 additions & 3 deletions llama-util.h
@@ -172,16 +172,16 @@ struct llama_mmap {
#ifdef _POSIX_MAPPED_FILES
static constexpr bool SUPPORTED = true;

- llama_mmap(struct llama_file * file, size_t prefetch = (size_t) -1 /* -1 = max value */, bool numa = false) {
+ llama_mmap(struct llama_file * file, size_t prefetch = (size_t) -1 /* -1 = max value */, bool numa = false, bool has_lora = false) {
size = file->size;
int fd = fileno(file->fp);
- int flags = MAP_PRIVATE;
+ int flags = (has_lora) ? MAP_PRIVATE : MAP_SHARED;
// prefetch/readahead impairs performance on NUMA systems
if (numa) { prefetch = 0; }
#ifdef __linux__
if (prefetch) { flags |= MAP_POPULATE; }
#endif
- addr = mmap(NULL, file->size, PROT_READ | PROT_WRITE, flags, fd, 0);
+ addr = mmap(NULL, file->size, (has_lora) ? PROT_READ | PROT_WRITE : PROT_READ, flags, fd, 0);
if (addr == MAP_FAILED) {
throw std::runtime_error(format("mmap failed: %s", strerror(errno)));
}
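
A note on the llama-util.h hunk above: the old code always mapped the model with PROT_READ | PROT_WRITE and MAP_PRIVATE so that LoRA adapters could later be merged into the mapped tensors. On Linux, locking (or pre-faulting) a writable private file mapping can force copy-on-write of every page, so with use_mlock the whole model gets duplicated into anonymous memory instead of simply pinning the shared page-cache pages; that is the load-time regression this PR targets. With has_lora == false the mapping goes back to read-only MAP_SHARED. Below is a minimal standalone sketch of the resulting policy, plain POSIX rather than llama.cpp code (map_model and its parameters are illustrative):

```cpp
// Illustrative sketch, not llama.cpp code: choose mapping flags the way this
// patch does. Without a LoRA the file is mapped read-only and MAP_SHARED, so
// mlock() pins the page-cache pages directly; with a LoRA the mapping must be
// writable and MAP_PRIVATE so the adapter can be merged in place.
#include <cerrno>
#include <cstdio>
#include <cstring>
#include <fcntl.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <unistd.h>

static void * map_model(const char * path, size_t * out_size, bool has_lora, bool use_mlock) {
    int fd = open(path, O_RDONLY);
    if (fd < 0) { perror("open"); return nullptr; }

    struct stat st;
    if (fstat(fd, &st) != 0) { perror("fstat"); close(fd); return nullptr; }
    const size_t size = (size_t) st.st_size;

    const int prot  = has_lora ? (PROT_READ | PROT_WRITE) : PROT_READ;
    const int flags = has_lora ? MAP_PRIVATE : MAP_SHARED;
    // MAP_PRIVATE + PROT_WRITE is fine on an O_RDONLY fd: writes only ever
    // touch the private copy-on-write pages, never the file itself.

    void * addr = mmap(nullptr, size, prot, flags, fd, 0);
    close(fd); // the mapping holds its own reference to the file
    if (addr == MAP_FAILED) {
        fprintf(stderr, "mmap failed: %s\n", strerror(errno));
        return nullptr;
    }

    if (use_mlock && mlock(addr, size) != 0) {
        // Usually RLIMIT_MEMLOCK is too low; not fatal, the pages just are not pinned.
        fprintf(stderr, "mlock failed: %s\n", strerror(errno));
    }

    *out_size = size;
    return addr;
}
```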

21 changes: 13 additions & 8 deletions llama.cpp
@@ -632,16 +632,18 @@ struct llama_model_loader {
std::unique_ptr<llama_file_loader> file_loader;
llama_load_tensors_map tensors_map;
bool use_mmap;
+ bool has_lora;
size_t num_ggml_tensors_created = 0;
struct ggml_context * ggml_ctx = NULL;
std::unique_ptr<llama_mmap> mapping;

- llama_model_loader(const std::string & fname_base, bool use_mmap) {
+ llama_model_loader(const std::string & fname_base, bool use_mmap, bool has_lora) {
file_loader = std::unique_ptr<llama_file_loader>(new llama_file_loader(fname_base.c_str(), tensors_map));
if (!llama_mmap::SUPPORTED) {
use_mmap = false;
}
this->use_mmap = use_mmap;
+ this->has_lora = has_lora;
}

void calc_sizes(size_t * ctx_size_p, size_t * mmapped_size_p) const {
@@ -707,7 +709,7 @@ struct llama_model_loader {
}

if (use_mmap) {
- mapping.reset(new llama_mmap(&file_loader->file, prefetch_size, ggml_is_numa()));
+ mapping.reset(new llama_mmap(&file_loader->file, prefetch_size, ggml_is_numa(), this->has_lora));
if (lmlock) {
lmlock->init(mapping->addr);
}
@@ -853,6 +855,7 @@ struct llama_context_params llama_context_default_params() {
/*.use_mmap =*/ true,
/*.use_mlock =*/ false,
/*.embedding =*/ false,
+ /*.has_lora =*/ false,
};

return result;
@@ -971,13 +974,14 @@ static void llama_model_load_internal(
ggml_type memory_type,
bool use_mmap,
bool use_mlock,
+ bool has_lora,
bool vocab_only,
llama_progress_callback progress_callback,
void * progress_callback_user_data) {

model.t_start_us = ggml_time_us();

- std::unique_ptr<llama_model_loader> ml(new llama_model_loader(fname, use_mmap));
+ std::unique_ptr<llama_model_loader> ml(new llama_model_loader(fname, use_mmap, has_lora));

vocab = std::move(ml->file_loader->vocab);
model.hparams = ml->file_loader->hparams;
@@ -1275,12 +1279,13 @@ static bool llama_model_load(
ggml_type memory_type,
bool use_mmap,
bool use_mlock,
+ bool has_lora,
bool vocab_only,
llama_progress_callback progress_callback,
void *progress_callback_user_data) {
try {
llama_model_load_internal(fname, model, vocab, n_ctx, n_batch, n_gpu_layers, main_gpu, tensor_split, low_vram, memory_type,
- use_mmap, use_mlock, vocab_only, progress_callback, progress_callback_user_data);
+ use_mmap, use_mlock, has_lora, vocab_only, progress_callback, progress_callback_user_data);
return true;
} catch (const std::exception & err) {
fprintf(stderr, "error loading model: %s\n", err.what());
@@ -2447,7 +2452,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
nthread = std::thread::hardware_concurrency();
}

- std::unique_ptr<llama_model_loader> model_loader(new llama_model_loader(fname_inp, /*use_mmap*/ false));
+ std::unique_ptr<llama_model_loader> model_loader(new llama_model_loader(fname_inp, /*use_mmap*/ false, /*has_lora*/ false));
llama_file_saver file_saver(fname_out.c_str(), model_loader->file_loader.get(), params->ftype);

#ifdef GGML_USE_K_QUANTS
@@ -2676,7 +2681,7 @@ struct llama_model * llama_load_model_from_file(

if (!llama_model_load(path_model, *model, model->vocab, params.n_ctx, params.n_batch, params.n_gpu_layers,
params.main_gpu, params.tensor_split, params.low_vram, memory_type, params.use_mmap, params.use_mlock,
- params.vocab_only, params.progress_callback, params.progress_callback_user_data)) {
+ params.has_lora, params.vocab_only, params.progress_callback, params.progress_callback_user_data)) {
delete model;
fprintf(stderr, "%s: failed to load model\n", __func__);
return nullptr;
@@ -2904,7 +2909,7 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
llama_buffer base_buf;
if (path_base_model) {
fprintf(stderr, "%s: loading base model from '%s'\n", __func__, path_base_model);
- model_loader.reset(new llama_model_loader(path_base_model, /*use_mmap*/ true));
+ model_loader.reset(new llama_model_loader(path_base_model, /*use_mmap*/ true, /*has_lora*/ true));

size_t ctx_size;
size_t mmapped_size;
@@ -2922,7 +2927,7 @@

// maybe this should in llama_model_loader
if (model_loader->use_mmap) {
- model_loader->mapping.reset(new llama_mmap(&model_loader->file_loader->file, /* prefetch */ 0, ggml_is_numa()));
+ model_loader->mapping.reset(new llama_mmap(&model_loader->file_loader->file, /* prefetch */ 0, ggml_is_numa(), model_loader->has_lora));
}
}
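
The last two hunks in llama.cpp pass has_lora = true when llama_apply_lora_from_file_internal re-opens the base model, since applying a LoRA writes the merged weights back into the mapped tensors (roughly W += scale * B*A), and that only works against the writable private mapping. Below is a rough f32 sketch of such an in-place merge; apply_lora_delta is a made-up helper, and the real code builds a ggml graph and also handles quantized tensors:

```cpp
// Illustrative only: merge a LoRA delta into a weight matrix that lives inside
// the mmap'ed model. The in-place writes below are the reason the mapping must
// be PROT_READ | PROT_WRITE and MAP_PRIVATE when a LoRA is present; the dirty
// pages stay in private copy-on-write memory and never reach the file on disk.
#include <cstddef>

static void apply_lora_delta(float * w,        // n_out x n_in, points into the mapping
                             const float * a,  // r x n_in   (LoRA "A" matrix)
                             const float * b,  // n_out x r  (LoRA "B" matrix)
                             size_t n_out, size_t n_in, size_t r, float scale) {
    for (size_t i = 0; i < n_out; ++i) {
        for (size_t j = 0; j < n_in; ++j) {
            float delta = 0.0f;
            for (size_t k = 0; k < r; ++k) {
                delta += b[i*r + k] * a[k*n_in + j];
            }
            w[i*n_in + j] += scale * delta; // write into the mapped region
        }
    }
}
```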


1 change: 1 addition & 0 deletions llama.h
@@ -102,6 +102,7 @@ extern "C" {
bool use_mmap; // use mmap if possible
bool use_mlock; // force system to keep model in RAM
bool embedding; // embedding mode only
+ bool has_lora; // a LoRA is being used
};
// model file types
enum llama_ftype {
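
Caller-side view of the new llama.h field, assuming the load API of this version (llama_load_model_from_file / llama_free_model; the model path is only an example). Leaving has_lora at its default keeps the fast read-only shared mapping; a program that intends to apply a LoRA adapter should set it to true before loading:

```cpp
// Hedged usage sketch; backend init/teardown and context creation are omitted.
#include "llama.h"
#include <cstdio>

int main() {
    struct llama_context_params lparams = llama_context_default_params();
    lparams.use_mmap  = true;
    lparams.use_mlock = true;   // keep the whole model resident in RAM
    lparams.has_lora  = false;  // no adapter: PROT_READ + MAP_SHARED fast path

    struct llama_model * model = llama_load_model_from_file("ggml-model-q4_0.bin", lparams);
    if (model == nullptr) {
        fprintf(stderr, "failed to load model\n");
        return 1;
    }

    // ... create a context, run inference ...

    llama_free_model(model);
    return 0;
}
```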