
Commit f853952

llama : refactor gguf_buffer and gguf_ctx_buffer
1 parent 797088a

File tree

4 files changed: +101 additions, -128 deletions


ggml-metal.h

Lines changed: 3 additions & 0 deletions
@@ -38,6 +38,9 @@ struct ggml_metal_context;
 struct ggml_metal_context * ggml_metal_init(int n_cb);
 void ggml_metal_free(struct ggml_metal_context * ctx);
 
+void * ggml_metal_host_malloc(size_t n);
+void   ggml_metal_host_free  (void * data);
+
 // set the number of command buffers to use
 void ggml_metal_set_n_cb(struct ggml_metal_context * ctx, int n_cb);
 

ggml-metal.m

Lines changed: 15 additions & 0 deletions
@@ -224,6 +224,21 @@ void ggml_metal_free(struct ggml_metal_context * ctx) {
     free(ctx);
 }
 
+void * ggml_metal_host_malloc(size_t n) {
+    void * data = NULL;
+    const int result = posix_memalign((void **) &data, getpagesize(), n);
+    if (result != 0) {
+        fprintf(stderr, "%s: error: posix_memalign failed\n", __func__);
+        return NULL;
+    }
+
+    return data;
+}
+
+void ggml_metal_host_free(void * data) {
+    free(data);
+}
+
 void ggml_metal_set_n_cb(struct ggml_metal_context * ctx, int n_cb) {
     ctx->n_cb = n_cb;
 }
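
Note: a minimal usage sketch of the two new helpers, for orientation only (this test program is not part of the commit and assumes a Metal-enabled build that links ggml-metal):

    // Illustrative only: allocate a page-aligned host buffer through the new
    // helpers and release it with the matching deallocator.
    #include <cstdio>
    #include <cstring>

    #include "ggml-metal.h"

    int main() {
        const size_t n = 16u * 1024u * 1024u; // 16 MiB scratch region

        void * data = ggml_metal_host_malloc(n);
        if (data == NULL) {
            fprintf(stderr, "host allocation failed\n");
            return 1;
        }

        memset(data, 0, n);          // behaves like ordinary host memory
        ggml_metal_host_free(data);  // always pair with ggml_metal_host_free
        return 0;
    }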

gguf-llama.cpp

Lines changed: 83 additions & 38 deletions
@@ -47,7 +47,6 @@
 #include <algorithm>
 #include <initializer_list>
 #include <thread>
-#include <atomic>
 #include <mutex>
 #include <sstream>
 #include <numeric>
@@ -92,6 +91,53 @@ static const size_t MB = 1024*1024;
 
 typedef void (*offload_func_t)(struct ggml_tensor * tensor);
 
+#ifdef GGML_USE_CUBLAS
+#define llama_host_malloc(n)  ggml_cuda_host_malloc(n)
+#define llama_host_free(data) ggml_cuda_host_free(data)
+#elif GGML_USE_METAL
+#define llama_host_malloc(n)  ggml_metal_host_malloc(n)
+#define llama_host_free(data) ggml_metal_host_free(data)
+#else
+#define llama_host_malloc(n)  malloc(n)
+#define llama_host_free(data) free(data)
+#endif
+
+struct llama_buffer {
+    void * data = NULL;
+    size_t size = 0;
+
+    // fallback to malloc / free
+    // useful in cases where CUDA can try to allocate PINNED memory
+    bool fallback = false;
+
+    void resize(size_t n) {
+        llama_host_free(data);
+
+        data = llama_host_malloc(n);
+        if (!data) {
+            fallback = true;
+            data = malloc(n);
+        } else {
+            fallback = false;
+        }
+
+        GGML_ASSERT(data);
+        size = n;
+    }
+
+    ~llama_buffer() {
+        if (data) {
+            if (fallback) { // NOLINT
+                free(data);
+            } else {
+                llama_host_free(data);
+            }
+        }
+
+        data = NULL;
+    }
+};
+
 void llama_nop(struct ggml_tensor * tensor) { // don't offload by default
     (void) tensor;
 }
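
Note: the struct above replaces gguf_buffer/gguf_ctx_buffer. A rough sketch of the intended call pattern, mirroring kv_cache_init and llama_model_load_internal further down in this diff (illustrative, assumes the surrounding gguf-llama.cpp context):

    // Illustrative only: resize() prefers pinned/page-aligned host memory on
    // CUDA/Metal builds and falls back to plain malloc(); the destructor frees
    // the memory with whichever allocator actually provided it.
    static struct ggml_context * ctx_from_buffer(llama_buffer & buf, size_t n_bytes) {
        buf.resize(n_bytes); // discards any previous contents

        struct ggml_init_params params;
        params.mem_size   = buf.size; // members are now .size and .data,
        params.mem_buffer = buf.data; // not .addr as with gguf_ctx_buffer
        params.no_alloc   = false;

        return ggml_init(params);
    }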
@@ -254,7 +300,7 @@ struct llama_kv_cache {
 
     struct ggml_context * ctx = NULL;
 
-    gguf_ctx_buffer buf;
+    llama_buffer buf;
 
     int n; // number of tokens currently in the cache
 
@@ -305,7 +351,7 @@ struct llama_model {
     struct ggml_context * ctx = NULL;
 
     // the model memory buffer
-    gguf_ctx_buffer buf;
+    llama_buffer buf;
 
     // model memory mapped file
     std::unique_ptr<gguf_mmap> mapping;
@@ -394,15 +440,15 @@ struct llama_context {
 
     // memory buffers used to evaluate the model
     // TODO: move in llama_state
-    gguf_ctx_buffer buf_compute;
+    llama_buffer buf_compute;
 
 #ifdef LLAMA_USE_ALLOCATOR
-    gguf_ctx_buffer buf_alloc;
+    llama_buffer buf_alloc;
     ggml_allocr * alloc = NULL;
 #endif
 
 #ifdef LLAMA_USE_SCRATCH
-    gguf_ctx_buffer buf_scratch[LLAMA_MAX_SCRATCH_BUFFERS];
+    llama_buffer buf_scratch[LLAMA_MAX_SCRATCH_BUFFERS];
 
     int buf_last = 0;
     size_t buf_max_size[LLAMA_MAX_SCRATCH_BUFFERS] = { 0 };
@@ -416,15 +462,15 @@ struct llama_context {
     ggml_mpi_context * ctx_mpi = NULL;
 #endif
 
-    void use_buf(struct ggml_context * ctx, int i) {
+    static void use_buf(struct ggml_context * ctx, int i) {
 #if defined(LLAMA_USE_SCRATCH)
         size_t last_size = 0;
 
         if (i == -1) {
             last_size = ggml_set_scratch(ctx, { 0, 0, nullptr, });
         } else {
             auto & buf = buf_scratch[i];
-            last_size = ggml_set_scratch(ctx, { 0, buf.size, buf.addr, });
+            last_size = ggml_set_scratch(ctx, { 0, buf.size, buf.data, });
         }
 
         if (buf_last >= 0) {
@@ -438,7 +484,7 @@ struct llama_context {
 #endif
     }
 
-    size_t get_buf_max_mem(int i) const {
+    static size_t get_buf_max_mem(int i) {
 #if defined(LLAMA_USE_SCRATCH)
         return buf_max_size[i];
 #else
@@ -1024,7 +1070,7 @@ static bool kv_cache_init(
 
     struct ggml_init_params params;
     params.mem_size = cache.buf.size;
-    params.mem_buffer = cache.buf.addr;
+    params.mem_buffer = cache.buf.data;
     params.no_alloc = false;
 
     cache.ctx = ggml_init(params);
@@ -1275,13 +1321,13 @@ static void llama_model_load_internal(
     {
         model.buf.resize(ctx_size);
         if (use_mlock) {
-            model.mlock_buf.init (model.buf.addr);
+            model.mlock_buf.init (model.buf.data);
             model.mlock_buf.grow_to(model.buf.size);
         }
 
         struct ggml_init_params params = {
            /*.mem_size   =*/ model.buf.size,
-           /*.mem_buffer =*/ model.buf.addr,
+           /*.mem_buffer =*/ model.buf.data,
            /*.no_alloc   =*/ ml->use_mmap,
         };
 
@@ -1565,7 +1611,7 @@ static struct ggml_cgraph * llama_build_graph(
 
     struct ggml_init_params params = {
         /*.mem_size   =*/ buf_compute.size,
-        /*.mem_buffer =*/ buf_compute.addr,
+        /*.mem_buffer =*/ buf_compute.data,
         /*.no_alloc   =*/ false,
     };
 
@@ -3012,11 +3058,11 @@ void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar
 // quantization
 //
 
-static void llama_convert_tensor_internal(const gguf_load_tensor & tensor, gguf_buffer & output, const int nelements, const int nthread) {
-    if (output.size < nelements * sizeof(float)) {
-        output.resize(nelements * sizeof(float));
+static void llama_convert_tensor_internal(const gguf_load_tensor & tensor, std::vector<float> & output, const size_t nelements, const int nthread) {
+    if (output.size() < nelements) {
+        output.resize(nelements);
     }
-    float * f32_output = (float *) output.addr;
+    float * f32_output = (float *) output.data();
 
     ggml_type_traits_t qtype;
     if (ggml_is_quantized(tensor.type)) {
@@ -3134,10 +3180,13 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     };
 
     size_t idx = 0;
+
+    std::vector<uint8_t> read_data;
+    std::vector<uint8_t> work;
+
     for (gguf_load_tensor & tensor : model_loader->tensors_map.tensors) {
-        gguf_buffer read_data;
         read_data.resize(tensor.size);
-        tensor.data = read_data.addr;
+        tensor.data = read_data.data();
         model_loader->load_data_for(tensor);
 
         LLAMA_LOG_INFO("[%4zu/%4zu] %36s - %16s, type = %6s, ",
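
Note: read_data and work are now hoisted out of the per-tensor loop, so std::vector capacity is reused across iterations instead of allocating a fresh gguf_buffer each time. A self-contained sketch of that reuse pattern (illustrative, not from the commit):

    // Illustrative only: resize() reallocates only while the requested size
    // grows, so the allocation cost is amortized across all tensors.
    #include <cstdint>
    #include <cstdio>
    #include <vector>

    int main() {
        std::vector<uint8_t> read_data; // declared once, outside the loop
        const size_t tensor_sizes[] = { 1024, 4096, 2048, 4096 };

        for (size_t sz : tensor_sizes) {
            read_data.resize(sz); // no reallocation once capacity >= sz
            // ... load sz bytes into read_data.data() and process them ...
            printf("size = %zu, capacity = %zu\n", read_data.size(), read_data.capacity());
        }
        return 0;
    }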
@@ -3156,7 +3205,6 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         enum ggml_type new_type;
         void * new_data;
         size_t new_size;
-        gguf_buffer work;
 
         if (!quantize) {
             new_type = tensor.type;
@@ -3214,35 +3262,36 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         }
 #endif
 
+        const size_t nelements = tensor.ne.at(0) * tensor.ne.at(1);
+
         float * f32_data;
-        size_t nelements = tensor.ne.at(0) * tensor.ne.at(1);
-        gguf_buffer f32_conv_buf;
+        std::vector<float> f32_conv_buf;
 
         if (tensor.type == GGML_TYPE_F32) {
             f32_data = (float *) tensor.data;
         } else if (ggml_is_quantized(tensor.type) && !params->allow_requantize) {
             throw std::runtime_error(format("requantizing from type %s is disabled", ggml_type_name(tensor.type)));
         } else {
             llama_convert_tensor_internal(tensor, f32_conv_buf, nelements, nthread);
-            f32_data = (float *) f32_conv_buf.addr;
+            f32_data = (float *) f32_conv_buf.data();
         }
 
         LLAMA_LOG_INFO("quantizing to %s .. ", ggml_type_name(new_type));
         fflush(stdout);
 
         work.resize(nelements * 4); // upper bound on size
-        new_data = work.addr;
+        new_data = work.data();
         std::vector<int64_t> hist_cur(1 << 4, 0);
 
-        int chunk_size = 32 * 512;
+        const int chunk_size = 32 * 512;
         const int nchunk = (nelements + chunk_size - 1)/chunk_size;
         const int nthread_use = nthread > 1 ? std::max(1, std::min(nthread, nchunk)) : 1;
         if (nthread_use < 2) {
             new_size = ggml_quantize_chunk(new_type, f32_data, new_data, 0, nelements, hist_cur.data());
         } else {
             size_t counter = 0;
             new_size = 0;
-            auto compute = [&mutex, &counter, &hist_cur, &new_size, new_type, f32_data, new_data, nelements, chunk_size] () {
+            auto compute = [&mutex, &counter, &hist_cur, &new_size, new_type, f32_data, new_data, nelements] () {
                 std::vector<int64_t> local_hist;
                 size_t local_size = 0;
                 while (true) {
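
Note: the compute lambda above pulls fixed-size chunks off a shared counter under a mutex and merges per-thread results at the end. A simplified, self-contained sketch of that work-distribution pattern (illustrative; a plain sum stands in for ggml_quantize_chunk):

    // Illustrative only: threads claim chunk_size elements at a time from a
    // shared counter and merge their local result once, under the same mutex.
    #include <algorithm>
    #include <cstddef>
    #include <mutex>
    #include <thread>
    #include <vector>

    int main() {
        const size_t nelements  = 1 << 20;
        const size_t chunk_size = 32 * 512;

        std::vector<float> data(nelements, 1.0f);

        std::mutex mutex;
        size_t counter = 0;
        double total   = 0.0;

        auto compute = [&]() {
            double local = 0.0;
            while (true) {
                size_t first;
                {
                    std::lock_guard<std::mutex> lock(mutex);
                    first    = counter;
                    counter += chunk_size;
                }
                if (first >= nelements) {
                    std::lock_guard<std::mutex> lock(mutex);
                    total += local; // merge per-thread result exactly once
                    break;
                }
                const size_t last = std::min(first + chunk_size, nelements);
                for (size_t i = first; i < last; ++i) {
                    local += data[i]; // stand-in for ggml_quantize_chunk(...)
                }
            }
        };

        std::vector<std::thread> workers;
        for (int i = 0; i < 4; ++i) {
            workers.emplace_back(compute);
        }
        for (auto & w : workers) {
            w.join();
        }
        return total == double(nelements) ? 0 : 1;
    }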
@@ -3315,8 +3364,6 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     }
 }
 
-
-
 //
 // interface implementation
 //
@@ -3438,7 +3485,7 @@ struct llama_context * llama_new_context_with_model(
         ggml_allocr_free(ctx->alloc);
 
         ctx->buf_alloc.resize(alloc_size);
-        ctx->alloc = ggml_allocr_new(ctx->buf_alloc.addr, ctx->buf_alloc.size, tensor_alignment);
+        ctx->alloc = ggml_allocr_new(ctx->buf_alloc.data, ctx->buf_alloc.size, tensor_alignment);
     }
 #else
     ctx->buf_compute.resize(MEM_REQ_EVAL().at(ctx->model.type) + ggml_graph_overhead());
@@ -3479,11 +3526,11 @@ struct llama_context * llama_new_context_with_model(
 
         LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "data", data_ptr, data_size, max_size));
 
-        LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "eval", ctx->buf_compute.addr, ctx->buf_compute.size, 0));
-        LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "kv",   ctx->kv_self.buf.addr, ctx->kv_self.buf.size, 0));
+        LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "eval", ctx->buf_compute.data, ctx->buf_compute.size, 0));
+        LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "kv",   ctx->kv_self.buf.data, ctx->kv_self.buf.size, 0));
 
-        LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "scr0", ctx->buf_scratch[0].addr, ctx->buf_scratch[0].size, 0));
-        LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "scr1", ctx->buf_scratch[1].addr, ctx->buf_scratch[1].size, 0));
+        LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "scr0", ctx->buf_scratch[0].data, ctx->buf_scratch[0].size, 0));
+        LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "scr1", ctx->buf_scratch[1].data, ctx->buf_scratch[1].size, 0));
 #undef LLAMA_METAL_CHECK_BUF
     }
 #endif
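
Note: every host region the Metal backend touches is registered by pointer and size; after this change those pointers come straight from llama_buffer::data. A hedged sketch of one such registration in isolation (illustrative; assumes a Metal-enabled build and the llama_buffer type above, and uses plain error handling rather than the commit's LLAMA_METAL_CHECK_BUF macro):

    // Illustrative only: map a llama_buffer-backed region into the Metal
    // context so GPU kernels can address it. The trailing 0 mirrors the
    // max_size argument used for the eval/kv/scratch buffers in this diff.
    #include <cstdio>

    #include "ggml-metal.h"

    static bool register_scratch(struct ggml_metal_context * ctx_metal, llama_buffer & buf) {
        if (!ggml_metal_add_buffer(ctx_metal, "scr0", buf.data, buf.size, 0)) {
            fprintf(stderr, "%s: failed to add buffer 'scr0'\n", __func__);
            return false;
        }
        return true;
    }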
@@ -3565,7 +3612,6 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
 
     LLAMA_LOG_INFO("%s: r = %d, alpha = %d, scaling = %.2f\n", __func__, lora_r, lora_alpha, scaling);
 
-
     // create a temporary ggml context to store the lora tensors
     // todo: calculate size from biggest possible tensor
     std::vector<uint8_t> lora_buf(1024ull * 1024ull * 1024ull);
@@ -3583,11 +3629,10 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
         model_tensors.insert(kv);
     }
 
-
     // load base model
     std::unique_ptr<llama_model_loader> model_loader;
     ggml_context * base_ctx = NULL;
-    gguf_buffer base_buf;
+    std::vector<uint8_t> base_buf;
     if (path_base_model) {
         LLAMA_LOG_INFO("%s: loading base model from '%s'\n", __func__, path_base_model);
         model_loader.reset(new llama_model_loader(path_base_model, /*use_mmap*/ true));
@@ -3598,8 +3643,8 @@ int llama_apply_lora_from_file_internal(const struct llama_model & model, const
         base_buf.resize(ctx_size);
 
         ggml_init_params base_params;
-        base_params.mem_size = base_buf.size;
-        base_params.mem_buffer = base_buf.addr;
+        base_params.mem_size = base_buf.size();
+        base_params.mem_buffer = base_buf.data();
         base_params.no_alloc = model_loader->use_mmap;
 
         base_ctx = ggml_init(base_params);
