Support calling mlock() on loaded model data on Linux and macOS #453


Merged 3 commits on Mar 24, 2023
ggml.c (76 changes: 65 additions & 11 deletions)
@@ -1,5 +1,5 @@
-// Defines CLOCK_MONOTONIC on Linux
-#define _POSIX_C_SOURCE 199309L
+// Defines CLOCK_MONOTONIC and asprintf on Linux
+#define _GNU_SOURCE

#include "ggml.h"

@@ -10,6 +10,7 @@
#endif

#include <assert.h>
+#include <errno.h>
#include <time.h>
#include <math.h>
#include <stdlib.h>
@@ -31,7 +32,6 @@
#else
// ref: https://github.com/ggerganov/whisper.cpp/issues/168
#include <windows.h>
-#include <errno.h>
#endif

typedef volatile LONG atomic_int;
@@ -83,6 +83,17 @@ typedef void* thread_ret_t;
#define static_assert(cond, msg) _Static_assert(cond, msg)
#endif

+#define GGML_MLOCK_SUPPORT 0
+
+#ifdef __has_include
+#if __has_include(<sys/mman.h>)
+#undef GGML_MLOCK_SUPPORT
+#define GGML_MLOCK_SUPPORT 1
+#include <sys/mman.h>
+#endif
+#endif
+
+
/*#define GGML_PERF*/
#define GGML_DEBUG 0
#define GGML_GELU_FP16
@@ -2344,6 +2355,7 @@ struct ggml_context {
size_t mem_size;
void * mem_buffer;
bool mem_buffer_owned;
+bool mem_buffer_mlocked;

int n_objects;

@@ -2619,16 +2631,19 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
}

*ctx = (struct ggml_context) {
-/*.mem_size =*/ params.mem_size,
-/*.mem_buffer =*/ params.mem_buffer ? params.mem_buffer : malloc(params.mem_size),
-/*.mem_buffer_owned =*/ params.mem_buffer ? false : true,
-/*.n_objects =*/ 0,
-/*.objects_begin =*/ NULL,
-/*.objects_end =*/ NULL,
-/*.scratch =*/ { 0, 0, NULL, },
-/*.scratch_save =*/ { 0, 0, NULL, },
+/*.mem_size =*/ params.mem_size,
+/*.mem_buffer =*/ params.mem_buffer ? params.mem_buffer : malloc(params.mem_size),
+/*.mem_buffer_owned =*/ params.mem_buffer ? false : true,
+/*.mem_buffer_mlocked =*/ false,
+/*.n_objects =*/ 0,
+/*.objects_begin =*/ NULL,
+/*.objects_end =*/ NULL,
+/*.scratch =*/ { 0, 0, NULL, },
+/*.scratch_save =*/ { 0, 0, NULL, },
};

+GGML_ASSERT(ctx->mem_buffer != NULL); // check for allocation failure
+
ggml_assert_aligned(ctx->mem_buffer);

GGML_PRINT_DEBUG("%s: context initialized\n", __func__);
@@ -2651,6 +2666,14 @@ void ggml_free(struct ggml_context * ctx) {
GGML_PRINT_DEBUG("%s: context %d with %d objects has been freed. memory used = %zu\n",
__func__, i, ctx->n_objects, ctx->objects_end->offs + ctx->objects_end->size);

+#if GGML_MLOCK_SUPPORT
+if (ctx->mem_buffer_mlocked) {
+if (munlock(ctx->mem_buffer, ctx->mem_size)) {
+fprintf(stderr, "%s: failed to munlock buffer: %s\n", __func__, strerror(errno));
+}
+}
+#endif
+
if (ctx->mem_buffer_owned) {
free(ctx->mem_buffer);
}
@@ -2679,6 +2702,37 @@ size_t ggml_set_scratch(struct ggml_context * ctx, struct ggml_scratch scratch)
return result;
}

+bool ggml_mlock_supported(void) {
+return GGML_MLOCK_SUPPORT;
+}
+
+#if GGML_MLOCK_SUPPORT
+#ifdef __APPLE__
+#define MLOCK_SUGGESTION "Try increasing the sysctl values 'vm.user_wire_limit' and 'vm.global_user_wire_limit' and/or\n" \
+"decreasing 'vm.global_no_user_wire_amount'. Also try increasing RLIMIT_MLOCK (ulimit -l)."
+#else
+#define MLOCK_SUGGESTION "Try increasing RLIMIT_MLOCK (ulimit -l)."
+#endif
+bool ggml_mlock(struct ggml_context * ctx, char ** err_p) {
+if (ctx->mem_buffer_mlocked) {
+return true;
+}
+if (mlock(ctx->mem_buffer, ctx->mem_size)) {
+int ret = asprintf(err_p, "failed to mlock %zu-byte buffer: %s\n" MLOCK_SUGGESTION,
+ctx->mem_size, strerror(errno));
+GGML_ASSERT(ret >= 0);
+return false;
+}
+ctx->mem_buffer_mlocked = true;
+return true;
+}
+#else // GGML_MLOCK_SUPPORT
+bool ggml_mlock(struct ggml_context * ctx, char ** err_p) {
+*err_p = strdup("can't mlock because it's not supported on this system");
+return false;
+}
+#endif // GGML_MLOCK_SUPPORT
+
////////////////////////////////////////////////////////////////////////////////

struct ggml_tensor * ggml_new_tensor_impl(
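A side note on the MLOCK_SUGGESTION text above: the ceiling it refers to ("ulimit -l") is RLIMIT_MEMLOCK, which can also be checked programmatically. A minimal diagnostic sketch, not part of this PR:

#include <stdio.h>
#include <sys/resource.h>

int main(void) {
    // RLIMIT_MEMLOCK caps how many bytes a process may mlock();
    // if the model buffer exceeds the soft limit, mlock() typically
    // fails with ENOMEM, triggering the suggestion message above.
    struct rlimit rl;
    if (getrlimit(RLIMIT_MEMLOCK, &rl) != 0) {
        perror("getrlimit");
        return 1;
    }
    printf("mlock soft limit: %llu bytes, hard limit: %llu bytes\n",
           (unsigned long long) rl.rlim_cur,
           (unsigned long long) rl.rlim_max);
    return 0;
}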
ggml.h (3 changes: 3 additions & 0 deletions)
@@ -343,6 +343,9 @@ size_t ggml_used_mem(const struct ggml_context * ctx);

size_t ggml_set_scratch(struct ggml_context * ctx, struct ggml_scratch scratch);

+bool ggml_mlock_supported(void);
+bool ggml_mlock(struct ggml_context * ctx, char ** err_p);
+
struct ggml_tensor * ggml_new_tensor(
struct ggml_context * ctx,
enum ggml_type type,
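These declarations imply an ownership contract that the implementation above follows: on failure, ggml_mlock() heap-allocates the message via asprintf() (or strdup() on unsupported platforms), so the caller must free() it. A minimal caller sketch; the helper name is hypothetical:

#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include "ggml.h"

// Hypothetical helper (not part of this PR): best-effort locking.
static bool try_mlock_context(struct ggml_context * ctx) {
    if (!ggml_mlock_supported()) {
        return false; // platform has no <sys/mman.h>
    }
    char * err = NULL;
    if (!ggml_mlock(ctx, &err)) {
        fprintf(stderr, "%s\n", err);
        free(err); // the error string is owned by the caller
        return false;
    }
    // ggml_mlock() is idempotent: calling it again on an already
    // locked context returns true without re-locking.
    return true;
}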
llama.cpp (14 changes: 13 additions & 1 deletion)
@@ -115,6 +115,7 @@ struct llama_context_params llama_context_default_params() {
/*.f16_kv =*/ false,
/*.logits_all =*/ false,
/*.vocab_only =*/ false,
+/*.use_mlock =*/ false,
/*.embedding =*/ false,
};

@@ -1428,11 +1429,22 @@

ggml_type type_memory = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32;

-if (!llama_model_load(path_model, *ctx, params.n_ctx, params.n_parts, type_memory, params.vocab_only)) {
+if (!llama_model_load(path_model, *ctx, params.n_ctx, params.n_parts, type_memory,
+params.vocab_only)) {
fprintf(stderr, "%s: failed to load model\n", __func__);
delete ctx;
return nullptr;
}

+if (params.use_mlock) {
+char *err;
+if (!ggml_mlock(ctx->model.ctx, &err)) {
+fprintf(stderr, "%s\n", err);
+free(err);
+delete ctx;
+return nullptr;
+}
+}
+
// reserve memory for context buffers
{
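For embedders who bypass main.cpp, the same behavior is reachable through the new use_mlock field; a minimal sketch assuming the API in this PR (the model path is a placeholder):

#include <stdio.h>
#include "llama.h"

int main(void) {
    struct llama_context_params lparams = llama_context_default_params();
    lparams.use_mlock = true; // ask llama_init_from_file() to mlock the weights

    // On mlock failure, llama_init_from_file() prints the error itself,
    // frees the message, and returns NULL.
    struct llama_context * ctx = llama_init_from_file("model.bin", lparams);
    if (ctx == NULL) {
        fprintf(stderr, "failed to load or lock model\n");
        return 1;
    }
    // ... run inference with ctx ...
    llama_free(ctx);
    return 0;
}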
llama.h (1 change: 1 addition & 0 deletions)
@@ -53,6 +53,7 @@ extern "C" {
bool f16_kv; // use fp16 for KV cache
bool logits_all; // the llama_eval() call computes all logits, not just the last one
bool vocab_only; // only load the vocabulary, no weights
+bool use_mlock; // force system to keep model in RAM
bool embedding; // embedding mode only
};

main.cpp (1 change: 1 addition & 0 deletions)
@@ -199,6 +199,7 @@ int main(int argc, char ** argv) {
lparams.seed = params.seed;
lparams.f16_kv = params.memory_f16;
lparams.logits_all = params.perplexity;
+lparams.use_mlock = params.use_mlock;
lparams.embedding = params.embedding;

ctx = llama_init_from_file(params.model.c_str(), lparams);
utils.cpp (7 changes: 7 additions & 0 deletions)
@@ -1,3 +1,5 @@
#include "ggml.h"

#include "utils.h"

#include <cassert>
@@ -127,6 +129,8 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
params.instruct = true;
} else if (arg == "--color") {
params.use_color = true;
} else if (arg == "--mlock") {
params.use_mlock = true;
} else if (arg == "-r" || arg == "--reverse-prompt") {
if (++i >= argc) {
invalid_param = true;
@@ -194,6 +198,9 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
fprintf(stderr, " --n_parts N number of model parts (default: -1 = determine from dimensions)\n");
fprintf(stderr, " -b N, --batch_size N batch size for prompt processing (default: %d)\n", params.n_batch);
fprintf(stderr, " --perplexity compute perplexity over the prompt\n");
+if (ggml_mlock_supported()) {
+fprintf(stderr, " --mlock force system to keep model in RAM rather than swapping or compressing\n");
+}
fprintf(stderr, " -m FNAME, --model FNAME\n");
fprintf(stderr, " model path (default: %s)\n", params.model.c_str());
fprintf(stderr, "\n");
utils.h (1 change: 1 addition & 0 deletions)
@@ -46,6 +46,7 @@ struct gpt_params {
bool instruct = false; // instruction mode (used for Alpaca models)
bool ignore_eos = false; // do not stop generating after eos
bool perplexity = false; // compute perplexity over the prompt
+bool use_mlock = false; // use mlock to keep model in memory
};

bool gpt_params_parse(int argc, char ** argv, gpt_params & params);