@@ -5137,12 +5137,17 @@ static bool llm_load_tensors(
     ml.init_mappings(true, &model.mlock_mmaps);

     // create the backend buffers
-    std::vector<std::pair<ggml_context *, std::map<uint32_t, ggml_backend_buffer_t>>> ctx_bufs;
+    std::vector<std::pair<ggml_context *, std::unordered_map<uint32_t, ggml_backend_buffer_t>>> ctx_bufs;
+
+    // Ensure we have enough capacity for the maximum backend buffer we will potentially create
+    size_t n_max_backend_buffer = ctx_map.size() * ml.files.size();
+    model.bufs.reserve(n_max_backend_buffer);

     for (auto & it : ctx_map) {
         ggml_backend_buffer_type_t buft = it.first;
         ggml_context * ctx = it.second;
-        std::map<uint32_t, ggml_backend_buffer_t> bufs;
+        std::unordered_map<uint32_t, ggml_backend_buffer_t> bufs;
+        bufs.reserve(n_max_backend_buffer);

         // only the mmap region containing the tensors in the model is mapped to the backend buffer
         // this is important for metal with apple silicon: if the entire model could be mapped to a metal buffer, then we could just use metal for all layers
@@ -5159,6 +5164,7 @@ static bool llm_load_tensors(
                 if (buf == nullptr) {
                     throw std::runtime_error("unable to allocate backend CPU buffer");
                 }
+                model.bufs.push_back(buf);
                 bufs.emplace(idx, buf);
 #ifdef GGML_USE_CUBLAS
                 if (n_layer >= n_gpu_layers) {
@@ -5183,6 +5189,7 @@ static bool llm_load_tensors(
                 if (buf == nullptr) {
                     throw std::runtime_error("unable to allocate backend metal buffer");
                 }
+                model.bufs.push_back(buf);
                 bufs.emplace(idx, buf);
             }
         }
@@ -5192,6 +5199,7 @@ static bool llm_load_tensors(
             if (buf == nullptr) {
                 throw std::runtime_error("unable to allocate backend buffer");
             }
+            model.bufs.push_back(buf);
             if (use_mlock && ggml_backend_buffer_is_host(buf)) {
                 model.mlock_bufs.emplace_back(new llama_mlock);
                 auto & mlock_buf = model.mlock_bufs.back();
@@ -5209,7 +5217,6 @@ static bool llm_load_tensors(
             // indicate that this buffer contains weights
             // this is used by ggml_backend_sched to improve op scheduling -> ops that use a weight are preferably scheduled to the backend that contains the weight
             ggml_backend_buffer_set_usage(buf.second, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
-            model.bufs.push_back(buf.second);
         }

         ctx_bufs.emplace_back(ctx, bufs);
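As a rough illustration (not part of the commit itself), below is a minimal, self-contained C++ sketch of the reservation pattern the diff introduces: compute an upper bound of one buffer per (context, file) pair, reserve once, then record each buffer the moment it is created instead of in a second pass at the end. All names in the sketch (buffer_t, n_ctx, n_files, model_bufs) are hypothetical stand-ins, not llama.cpp symbols.

// Minimal sketch of the reserve-then-push_back pattern, with stand-in types.
#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <unordered_map>
#include <utility>
#include <vector>

using buffer_t = int *; // stand-in for ggml_backend_buffer_t

int main() {
    const size_t n_ctx   = 3; // plays the role of ctx_map.size()
    const size_t n_files = 2; // plays the role of ml.files.size()

    // upper bound: at most one buffer per (context, file) pair
    const size_t n_max_backend_buffer = n_ctx * n_files;

    std::vector<buffer_t> model_bufs;         // plays the role of model.bufs
    model_bufs.reserve(n_max_backend_buffer); // no reallocation in the loops below

    std::vector<std::pair<size_t, std::unordered_map<uint32_t, buffer_t>>> ctx_bufs;

    for (size_t c = 0; c < n_ctx; ++c) {
        std::unordered_map<uint32_t, buffer_t> bufs; // per-context map keyed by file index
        bufs.reserve(n_max_backend_buffer);          // bucket table sized up front

        for (uint32_t idx = 0; idx < n_files; ++idx) {
            buffer_t buf = new int(0);  // stand-in for the backend buffer allocation
            model_bufs.push_back(buf);  // recorded globally as soon as it is created
            bufs.emplace(idx, buf);     // and per (context, file)
        }

        ctx_bufs.emplace_back(c, std::move(bufs));
    }

    std::printf("allocated %zu buffers (capacity %zu)\n", model_bufs.size(), model_bufs.capacity());

    for (buffer_t buf : model_bufs) {
        delete buf;
    }
    return 0;
}

With the vector reserved up front, the push_back calls inside the loops cannot trigger a reallocation, and std::unordered_map::reserve sizes the bucket table so the emplace calls do not rehash; the bound n_ctx * n_files mirrors ctx_map.size() * ml.files.size() in the diff.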