Skip to content

Commit 563cdc3

Browse files
comex and ggerganov authored
Support calling mlock() on loaded model data on Linux and macOS (#453)
* Support calling mlock() on loaded model data on Linux and macOS This is enabled by a new --mlock command line option. Using mlock() disables swapping and memory compression for the model data. Doing so can be useful on systems where the model takes up a large fraction of system RAM. In my experience, macOS is quite eager to start compressing llama.cpp's memory, which then makes it halt for a few seconds while it decompresses, even with a model that uses "only" 25GB out of 32GB. Of course, this comes at the cost of forcing the system to swap or compress other processes' memory instead, so it needs to be used with care and shouldn't be enabled by default. In theory it should be possible to support this on Windows as well using VirtualLock(), but I'm not much of a Windows user. * Update llama.cpp --------- Co-authored-by: Georgi Gerganov <[email protected]>
1 parent 8d4a855 commit 563cdc3

File tree

7 files changed

+91
-12
lines changed

7 files changed

+91
-12
lines changed

ggml.c

Lines changed: 65 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
1-
// Defines CLOCK_MONOTONIC on Linux
2-
#define _POSIX_C_SOURCE 199309L
1+
// Defines CLOCK_MONOTONIC and asprintf on Linux
2+
#define _GNU_SOURCE
33

44
#include "ggml.h"
55

@@ -10,6 +10,7 @@
1010
#endif
1111

1212
#include <assert.h>
13+
#include <errno.h>
1314
#include <time.h>
1415
#include <math.h>
1516
#include <stdlib.h>
@@ -31,7 +32,6 @@
3132
#else
3233
// ref: https://github.com/ggerganov/whisper.cpp/issues/168
3334
#include <windows.h>
34-
#include <errno.h>
3535
#endif
3636

3737
typedef volatile LONG atomic_int;
@@ -83,6 +83,17 @@ typedef void* thread_ret_t;
8383
#define static_assert(cond, msg) _Static_assert(cond, msg)
8484
#endif
8585

86+
#define GGML_MLOCK_SUPPORT 0
87+
88+
#ifdef __has_include
89+
#if __has_include(<sys/mman.h>)
90+
#undef GGML_MLOCK_SUPPORT
91+
#define GGML_MLOCK_SUPPORT 1
92+
#include <sys/mman.h>
93+
#endif
94+
#endif
95+
96+
8697
/*#define GGML_PERF*/
8798
#define GGML_DEBUG 0
8899
#define GGML_GELU_FP16
@@ -2344,6 +2355,7 @@ struct ggml_context {
23442355
size_t mem_size;
23452356
void * mem_buffer;
23462357
bool mem_buffer_owned;
2358+
bool mem_buffer_mlocked;
23472359

23482360
int n_objects;
23492361

@@ -2619,16 +2631,19 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
26192631
}
26202632

26212633
*ctx = (struct ggml_context) {
2622-
/*.mem_size =*/ params.mem_size,
2623-
/*.mem_buffer =*/ params.mem_buffer ? params.mem_buffer : malloc(params.mem_size),
2624-
/*.mem_buffer_owned =*/ params.mem_buffer ? false : true,
2625-
/*.n_objects =*/ 0,
2626-
/*.objects_begin =*/ NULL,
2627-
/*.objects_end =*/ NULL,
2628-
/*.scratch =*/ { 0, 0, NULL, },
2629-
/*.scratch_save =*/ { 0, 0, NULL, },
2634+
/*.mem_size =*/ params.mem_size,
2635+
/*.mem_buffer =*/ params.mem_buffer ? params.mem_buffer : malloc(params.mem_size),
2636+
/*.mem_buffer_owned =*/ params.mem_buffer ? false : true,
2637+
/*.mem_buffer_mlocked =*/ false,
2638+
/*.n_objects =*/ 0,
2639+
/*.objects_begin =*/ NULL,
2640+
/*.objects_end =*/ NULL,
2641+
/*.scratch =*/ { 0, 0, NULL, },
2642+
/*.scratch_save =*/ { 0, 0, NULL, },
26302643
};
26312644

2645+
GGML_ASSERT(ctx->mem_buffer != NULL); // check for allocation failure
2646+
26322647
ggml_assert_aligned(ctx->mem_buffer);
26332648

26342649
GGML_PRINT_DEBUG("%s: context initialized\n", __func__);
@@ -2651,6 +2666,14 @@ void ggml_free(struct ggml_context * ctx) {
26512666
GGML_PRINT_DEBUG("%s: context %d with %d objects has been freed. memory used = %zu\n",
26522667
__func__, i, ctx->n_objects, ctx->objects_end->offs + ctx->objects_end->size);
26532668

2669+
#if GGML_MLOCK_SUPPORT
2670+
if (ctx->mem_buffer_mlocked) {
2671+
if (munlock(ctx->mem_buffer, ctx->mem_size)) {
2672+
fprintf(stderr, "%s: failed to munlock buffer: %s\n", __func__, strerror(errno));
2673+
}
2674+
}
2675+
#endif
2676+
26542677
if (ctx->mem_buffer_owned) {
26552678
free(ctx->mem_buffer);
26562679
}
@@ -2679,6 +2702,37 @@ size_t ggml_set_scratch(struct ggml_context * ctx, struct ggml_scratch scratch)
26792702
return result;
26802703
}
26812704

2705+
bool ggml_mlock_supported(void) {
2706+
return GGML_MLOCK_SUPPORT;
2707+
}
2708+
2709+
#if GGML_MLOCK_SUPPORT
2710+
#ifdef __APPLE__
2711+
#define MLOCK_SUGGESTION "Try increasing the sysctl values 'vm.user_wire_limit' and 'vm.global_user_wire_limit' and/or\n" \
2712+
"decreasing 'vm.global_no_user_wire_amount'. Also try increasing RLIMIT_MLOCK (ulimit -l)."
2713+
#else
2714+
#define MLOCK_SUGGESTION "Try increasing RLIMIT_MLOCK (ulimit -l)."
2715+
#endif
2716+
bool ggml_mlock(struct ggml_context * ctx, char ** err_p) {
2717+
if (ctx->mem_buffer_mlocked) {
2718+
return true;
2719+
}
2720+
if (mlock(ctx->mem_buffer, ctx->mem_size)) {
2721+
int ret = asprintf(err_p, "failed to mlock %zu-byte buffer: %s\n" MLOCK_SUGGESTION,
2722+
ctx->mem_size, strerror(errno));
2723+
GGML_ASSERT(ret >= 0);
2724+
return false;
2725+
}
2726+
ctx->mem_buffer_mlocked = true;
2727+
return true;
2728+
}
2729+
#else // GGML_MLOCK_SUPPORT
2730+
bool ggml_mlock(struct ggml_context * ctx, char ** err_p) {
2731+
*err_p = strdup("can't mlock because it's not supported on this system");
2732+
return false;
2733+
}
2734+
#endif // GGML_MLOCK_SUPPORT
2735+
26822736
////////////////////////////////////////////////////////////////////////////////
26832737

26842738
struct ggml_tensor * ggml_new_tensor_impl(

ggml.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -343,6 +343,9 @@ size_t ggml_used_mem(const struct ggml_context * ctx);
343343

344344
size_t ggml_set_scratch(struct ggml_context * ctx, struct ggml_scratch scratch);
345345

346+
bool ggml_mlock_supported(void);
347+
bool ggml_mlock(struct ggml_context * ctx, char ** err_p);
348+
346349
struct ggml_tensor * ggml_new_tensor(
347350
struct ggml_context * ctx,
348351
enum ggml_type type,

llama.cpp

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -115,6 +115,7 @@ struct llama_context_params llama_context_default_params() {
115115
/*.f16_kv =*/ false,
116116
/*.logits_all =*/ false,
117117
/*.vocab_only =*/ false,
118+
/*.use_mlock =*/ false,
118119
/*.embedding =*/ false,
119120
};
120121

@@ -1428,11 +1429,22 @@ struct llama_context * llama_init_from_file(
14281429

14291430
ggml_type type_memory = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32;
14301431

1431-
if (!llama_model_load(path_model, *ctx, params.n_ctx, params.n_parts, type_memory, params.vocab_only)) {
1432+
if (!llama_model_load(path_model, *ctx, params.n_ctx, params.n_parts, type_memory,
1433+
params.vocab_only)) {
14321434
fprintf(stderr, "%s: failed to load model\n", __func__);
14331435
delete ctx;
14341436
return nullptr;
14351437
}
1438+
1439+
if (params.use_mlock) {
1440+
char *err;
1441+
if (!ggml_mlock(ctx->model.ctx, &err)) {
1442+
fprintf(stderr, "%s\n", err);
1443+
free(err);
1444+
delete ctx;
1445+
return nullptr;
1446+
}
1447+
}
14361448

14371449
// reserve memory for context buffers
14381450
{

llama.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,7 @@ extern "C" {
5353
bool f16_kv; // use fp16 for KV cache
5454
bool logits_all; // the llama_eval() call computes all logits, not just the last one
5555
bool vocab_only; // only load the vocabulary, no weights
56+
bool use_mlock; // force system to keep model in RAM
5657
bool embedding; // embedding mode only
5758
};
5859

main.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -199,6 +199,7 @@ int main(int argc, char ** argv) {
199199
lparams.seed = params.seed;
200200
lparams.f16_kv = params.memory_f16;
201201
lparams.logits_all = params.perplexity;
202+
lparams.use_mlock = params.use_mlock;
202203
lparams.embedding = params.embedding;
203204

204205
ctx = llama_init_from_file(params.model.c_str(), lparams);

utils.cpp

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
#include "ggml.h"
2+
13
#include "utils.h"
24

35
#include <cassert>
@@ -127,6 +129,8 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
127129
params.instruct = true;
128130
} else if (arg == "--color") {
129131
params.use_color = true;
132+
} else if (arg == "--mlock") {
133+
params.use_mlock = true;
130134
} else if (arg == "-r" || arg == "--reverse-prompt") {
131135
if (++i >= argc) {
132136
invalid_param = true;
@@ -194,6 +198,9 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
194198
fprintf(stderr, " --n_parts N number of model parts (default: -1 = determine from dimensions)\n");
195199
fprintf(stderr, " -b N, --batch_size N batch size for prompt processing (default: %d)\n", params.n_batch);
196200
fprintf(stderr, " --perplexity compute perplexity over the prompt\n");
201+
if (ggml_mlock_supported()) {
202+
fprintf(stderr, " --mlock force system to keep model in RAM rather than swapping or compressing\n");
203+
}
197204
fprintf(stderr, " -m FNAME, --model FNAME\n");
198205
fprintf(stderr, " model path (default: %s)\n", params.model.c_str());
199206
fprintf(stderr, "\n");

utils.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,7 @@ struct gpt_params {
4646
bool instruct = false; // instruction mode (used for Alpaca models)
4747
bool ignore_eos = false; // do not stop generating after eos
4848
bool perplexity = false; // compute perplexity over the prompt
49+
bool use_mlock = false; // use mlock to keep model in memory
4950
};
5051

5152
bool gpt_params_parse(int argc, char ** argv, gpt_params & params);

0 commit comments

Comments
 (0)