Commit 24d5ddf

fixup! GPU weights not in RAM, direct loading with cuFile
1 parent 1bfe5a9

2 files changed: +9, -5 lines

llama-util.h

Lines changed: 3 additions & 3 deletions
@@ -172,7 +172,7 @@ struct llama_mmap {
 #ifdef _POSIX_MAPPED_FILES
     static constexpr bool SUPPORTED = true;

-    llama_mmap(struct llama_file * file, bool prefetch = true) {
+    llama_mmap(struct llama_file * file, size_t prefetch = (size_t) -1 /* -1 = max value */) {
         size = file->size;
         int fd = fileno(file->fp);
         int flags = MAP_SHARED;
@@ -184,9 +184,9 @@ struct llama_mmap {
             throw std::runtime_error(format("mmap failed: %s", strerror(errno)));
         }

-        if (prefetch) {
+        if (prefetch > 0) {
             // Advise the kernel to preload the mapped memory
-            if (madvise(addr, file->size, MADV_WILLNEED)) {
+            if (madvise(addr, std::min(file->size, prefetch), MADV_WILLNEED)) {
                 fprintf(stderr, "warning: madvise(.., MADV_WILLNEED) failed: %s\n",
                         strerror(errno));
             }
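
The change above turns the constructor's prefetch argument from an on/off flag into a byte count: the default (size_t) -1 keeps the old "prefetch everything" behavior, 0 disables readahead, and any other value advises only the first min(file->size, prefetch) bytes. Below is a standalone sketch of that pattern; demo_map() is a hypothetical helper for illustration, not part of this patch, and assumes a POSIX system:

#include <algorithm>   // std::min
#include <cerrno>
#include <cstdio>
#include <cstring>     // strerror
#include <fcntl.h>     // open
#include <sys/mman.h>  // mmap, madvise
#include <sys/stat.h>  // fstat
#include <unistd.h>    // close

// Map a file read-only, but ask the kernel to read ahead only the
// first `prefetch` bytes. prefetch == 0 skips readahead entirely;
// (size_t) -1 is clamped to the file size by std::min.
static void * demo_map(const char * path, size_t prefetch) {
    int fd = open(path, O_RDONLY);
    if (fd == -1) { return nullptr; }
    struct stat st;
    if (fstat(fd, &st) == -1) { close(fd); return nullptr; }
    size_t size = (size_t) st.st_size;

    void * addr = mmap(NULL, size, PROT_READ, MAP_SHARED, fd, 0);
    close(fd); // the mapping stays valid after the fd is closed
    if (addr == MAP_FAILED) { return nullptr; }

    if (prefetch > 0) {
        // Advise readahead for just the leading portion of the mapping.
        if (madvise(addr, std::min(size, prefetch), MADV_WILLNEED)) {
            fprintf(stderr, "warning: madvise(.., MADV_WILLNEED) failed: %s\n", strerror(errno));
        }
    }
    return addr;
}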

llama.cpp

Lines changed: 6 additions & 2 deletions
@@ -679,12 +679,16 @@ struct llama_model_loader {

     void load_all_data(llama_progress_callback progress_callback, void * progress_callback_user_data, llama_mlock * lmlock) {
         size_t data_size = 0;
+        size_t prefetch_size = 0;
         for (const llama_load_tensor & lt : tensors_map.tensors) {
             data_size += lt.size;
+            if (lt.ggml_tensor->backend == GGML_BACKEND_CPU) {
+                prefetch_size += lt.size;
+            }
         }

         if (use_mmap) {
-            mapping.reset(new llama_mmap(&file_loaders.at(0)->file, false));
+            mapping.reset(new llama_mmap(&file_loaders.at(0)->file, prefetch_size));
             if (!lmlock) {
                 // Don't call the callback since the actual loading will be lazy
                 // and we can't measure it.
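
The loader now sums the sizes of the tensors that stay on the CPU backend and passes that total as the prefetch budget, so weights destined for the GPU are no longer pulled through the page cache at load time. A minimal sketch of the accounting, using stand-in types for illustration (the real ones are llama_load_tensor and the ggml backend field in this tree):

#include <cstddef>
#include <vector>

// Hypothetical mirrors of the loader's bookkeeping structures.
enum demo_backend { DEMO_BACKEND_CPU, DEMO_BACKEND_GPU };
struct demo_tensor { demo_backend backend; size_t size; };

// Sum only the bytes the CPU will actually read through the mapping;
// GPU-bound tensors are uploaded directly and gain nothing from
// readahead into RAM.
static size_t demo_prefetch_size(const std::vector<demo_tensor> & tensors) {
    size_t prefetch_size = 0;
    for (const demo_tensor & t : tensors) {
        if (t.backend == DEMO_BACKEND_CPU) {
            prefetch_size += t.size;
        }
    }
    return prefetch_size;
}

Note that this is a byte budget applied from the start of the mapping (see the madvise cap above), not a per-tensor advise; it approximates the CPU-resident region rather than marking each tensor's exact byte range.
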
@@ -2317,7 +2321,7 @@ int llama_apply_lora_from_file_internal(struct llama_context * ctx, const char *

         // maybe this should in llama_model_loader
         if (model_loader->use_mmap) {
-            model_loader->mapping.reset(new llama_mmap(&model_loader->file_loaders.at(0)->file, /* prefetch */ false));
+            model_loader->mapping.reset(new llama_mmap(&model_loader->file_loaders.at(0)->file, /* prefetch */ 0));
         }
     }
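
One subtlety of the bool-to-size_t migration: stale boolean call sites still compile, because bool converts implicitly to size_t, so each one has to be updated by hand (this commit converts the two shown above). A hypothetical call-site sketch, assuming llama-util.h from this tree:

#include "llama-util.h" // llama_file, llama_mmap

void demo(llama_file * file) {
    llama_mmap map_all (file);                    // default (size_t) -1: prefetch the whole file
    llama_mmap map_none(file, /* prefetch */ 0);  // no readahead, as in the LoRA path above
    llama_mmap map_oops(file, true);              // still compiles, but true becomes a 1-byte prefetch
}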
