
Commit a9e88c6

llama_model_loader: immediately add the backend buffer to the model buffers in order to free them if an error occurs in the next allocation. Reserve the expected size.
1 parent ec372c6


llama.cpp

Lines changed: 10 additions & 3 deletions
@@ -5137,12 +5137,17 @@ static bool llm_load_tensors(
     ml.init_mappings(true, &model.mlock_mmaps);
 
     // create the backend buffers
-    std::vector<std::pair<ggml_context *, std::map<uint32_t, ggml_backend_buffer_t>>> ctx_bufs;
+    std::vector<std::pair<ggml_context *, std::unordered_map<uint32_t, ggml_backend_buffer_t>>> ctx_bufs;
+
+    // Ensure we have enough capacity for the maximum backend buffer we will potentially create
+    size_t n_max_backend_buffer = ctx_map.size() * ml.files.size();
+    model.bufs.reserve(n_max_backend_buffer);
 
     for (auto & it : ctx_map) {
         ggml_backend_buffer_type_t buft = it.first;
         ggml_context * ctx = it.second;
-        std::map<uint32_t, ggml_backend_buffer_t> bufs;
+        std::unordered_map<uint32_t, ggml_backend_buffer_t> bufs;
+        bufs.reserve(n_max_backend_buffer);
 
         // only the mmap region containing the tensors in the model is mapped to the backend buffer
         // this is important for metal with apple silicon: if the entire model could be mapped to a metal buffer, then we could just use metal for all layers
@@ -5159,6 +5164,7 @@
                 if (buf == nullptr) {
                     throw std::runtime_error("unable to allocate backend CPU buffer");
                 }
+                model.bufs.push_back(buf);
                 bufs.emplace(idx, buf);
 #ifdef GGML_USE_CUBLAS
                 if (n_layer >= n_gpu_layers) {
@@ -5183,6 +5189,7 @@
                 if (buf == nullptr) {
                     throw std::runtime_error("unable to allocate backend metal buffer");
                 }
+                model.bufs.push_back(buf);
                 bufs.emplace(idx, buf);
             }
         }
@@ -5192,6 +5199,7 @@
             if (buf == nullptr) {
                 throw std::runtime_error("unable to allocate backend buffer");
             }
+            model.bufs.push_back(buf);
             if (use_mlock && ggml_backend_buffer_is_host(buf)) {
                 model.mlock_bufs.emplace_back(new llama_mlock);
                 auto & mlock_buf = model.mlock_bufs.back();
@@ -5209,7 +5217,6 @@
             // indicate that this buffer contains weights
             // this is used by ggml_backend_sched to improve op scheduling -> ops that use a weight are preferably scheduled to the backend that contains the weight
             ggml_backend_buffer_set_usage(buf.second, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
-            model.bufs.push_back(buf.second);
         }
 
         ctx_bufs.emplace_back(ctx, bufs);
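The pattern this commit applies is worth spelling out: register each buffer with its owning container the moment it is allocated, so that a throw from any later allocation still leaves every earlier buffer reachable for cleanup, and reserve the container's worst-case capacity up front so that push_back itself cannot reallocate (and thus fail) between an allocation and the point where ownership is recorded. The following is a minimal, self-contained C++ sketch of that idea, not the llama.cpp code itself: the buffer_t type and the alloc_buffer/free_buffer/alloc_all helpers are hypothetical stand-ins for ggml_backend_buffer_t and its API.

#include <cstddef>
#include <cstdio>
#include <cstdlib>
#include <stdexcept>
#include <vector>

// Toy stand-in for ggml_backend_buffer_t (hypothetical, for illustration only).
struct buffer_t {
    void * data;
};

static buffer_t * alloc_buffer(size_t size) {
    void * data = std::malloc(size);
    if (data == nullptr) {
        // mirrors the "unable to allocate backend buffer" throws in the diff
        throw std::runtime_error("unable to allocate buffer");
    }
    return new buffer_t{data};
}

static void free_buffer(buffer_t * buf) {
    std::free(buf->data);
    delete buf;
}

// Allocates n buffers; on any failure, frees everything allocated so far.
static std::vector<buffer_t *> alloc_all(size_t n, size_t size) {
    std::vector<buffer_t *> bufs;

    // reserve the worst case up front: push_back below can then never
    // reallocate, so recording ownership of a fresh allocation cannot fail
    bufs.reserve(n);

    try {
        for (size_t i = 0; i < n; i++) {
            buffer_t * buf = alloc_buffer(size);
            // record the buffer *immediately*: if the next iteration's
            // allocation throws, this one is still reachable and gets freed
            bufs.push_back(buf);
        }
    } catch (...) {
        for (buffer_t * buf : bufs) {
            free_buffer(buf); // no leak: every successful allocation was recorded
        }
        throw;
    }
    return bufs;
}

int main() {
    std::vector<buffer_t *> bufs = alloc_all(8, 1024 * 1024);
    std::printf("allocated %zu buffers\n", bufs.size());
    for (buffer_t * buf : bufs) {
        free_buffer(buf);
    }
    return 0;
}

In llama.cpp itself there is no local catch block: the cleanup loop lives in the llama_model destructor, which frees everything in model.bufs, so pushing each buffer into model.bufs right after allocation (instead of only at the end, as before this commit) is what lets the error path reach it.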
