
Commit 4612bf5

Leverage mmap for CUDA loading
1 parent 9361803 commit 4612bf5
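
The gist of the change: instead of re-opening the model file and fread-ing each GPU tensor into a temporary buffer, the CUDA loader now receives a host pointer that (with mmap enabled) points straight into the memory-mapped model file, and copies from there to the device. A minimal standalone sketch of that idea, assuming POSIX mmap and the CUDA runtime; the file name and helper below are hypothetical, not code from this commit:

// Sketch: map the model file once, then upload a tensor's bytes to the GPU
// directly from the mapping -- no intermediate malloc/fseek/fread.
#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <fcntl.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <unistd.h>
#include <cuda_runtime.h>

// Hypothetical helper: 'mapped' is the start of the mmapped file; 'offset' and
// 'nbytes' describe one tensor's region inside it. The copy source is the
// mapping itself.
static void * upload_region(const uint8_t * mapped, size_t offset, size_t nbytes) {
    void * dev = nullptr;
    cudaMalloc(&dev, nbytes);
    cudaMemcpy(dev, mapped + offset, nbytes, cudaMemcpyHostToDevice);
    return dev;
}

int main() {
    const int fd = open("model.bin", O_RDONLY);   // hypothetical model file
    if (fd < 0) { perror("open"); return 1; }

    struct stat st;
    fstat(fd, &st);

    void * addr = mmap(NULL, st.st_size, PROT_READ, MAP_SHARED, fd, 0);
    if (addr == MAP_FAILED) { perror("mmap"); return 1; }
    const uint8_t * mapped = (const uint8_t *) addr;

    // upload the whole file as one "tensor" just to exercise the path
    void * dev = upload_region(mapped, 0, (size_t) st.st_size);

    cudaFree(dev);
    munmap(addr, st.st_size);
    close(fd);
    return 0;
}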

3 files changed (+15, -52 lines)


ggml-cuda.cu

Lines changed: 1 addition & 20 deletions
@@ -926,32 +926,13 @@ void ggml_cuda_transform_tensor(ggml_tensor * tensor) {
     tensor->backend = GGML_BACKEND_CUDA;
 }
 
-void ggml_cuda_load_data(const char * fname, struct ggml_tensor * tensor, const size_t offset) {
-    FILE * fp = fopen(fname, "rb");
-
+void ggml_cuda_load_data(const void * buf_host, struct ggml_tensor * tensor) {
     const size_t size = ggml_nbytes(tensor);
 
     void * buf;
     CUDA_CHECK(cudaMalloc(&buf, size));
-    void * buf_host = malloc(size);
-
-#ifdef _WIN32
-    int ret = _fseeki64(fp, (__int64) offset, SEEK_SET);
-#else
-    int ret = fseek(fp, (long) offset, SEEK_SET);
-#endif
-    GGML_ASSERT(ret == 0); // same
-
-    size_t ret2 = fread(buf_host, size, 1, fp);
-    if (ret2 != 1) {
-        fprintf(stderr, "unexpectedly reached end of file");
-        exit(1);
-    }
 
     cudaMemcpy(buf, buf_host, size, cudaMemcpyHostToDevice);
-    cudaDeviceSynchronize();
 
     tensor->data = buf;
-    free(buf_host);
-    fclose(fp);
 }
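
After this change the caller owns the host pointer; with an mmapped model it is just the mapping base plus the tensor's offset in the file. A hedged sketch of such a call site (the wrapper name and its parameters are invented for illustration, not part of the commit):

// Illustrative only: upload one tensor whose bytes live at 'file_off' inside
// the mmapped model file starting at 'mapping_addr'.
#include <cstddef>
#include <cstdint>
#include "ggml.h"
#include "ggml-cuda.h"

static void upload_mapped_tensor(const void * mapping_addr, size_t file_off,
                                 struct ggml_tensor * tensor) {
    const uint8_t * base = (const uint8_t *) mapping_addr;
    // ggml_cuda_load_data now just does cudaMalloc + cudaMemcpy from this pointer
    ggml_cuda_load_data(base + file_off, tensor);
}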

ggml-cuda.h

Lines changed: 1 addition & 1 deletion
@@ -16,7 +16,7 @@ void * ggml_cuda_host_malloc(size_t size);
 void ggml_cuda_host_free(void * ptr);
 
 void ggml_cuda_transform_tensor(struct ggml_tensor * tensor);
-void ggml_cuda_load_data(const char * fname, struct ggml_tensor * tensors, size_t offset);
+void ggml_cuda_load_data(const void * data, struct ggml_tensor * tensors);
 
 #ifdef __cplusplus
 }

llama.cpp

Lines changed: 13 additions & 31 deletions
@@ -700,28 +700,31 @@ struct llama_model_loader {
 
         if (use_mmap) {
             mapping.reset(new llama_mmap(&file_loaders.at(0)->file, prefetch_size));
-            if (!lmlock) {
-                // Don't call the callback since the actual loading will be lazy
-                // and we can't measure it.
-                progress_callback = NULL;
-            }
             if (lmlock) {
                 lmlock->init(mapping->addr);
             }
         }
 
         size_t done_size = 0;
         for (llama_load_tensor & lt : tensors_map.tensors) {
-            if (lt.ggml_tensor->backend != GGML_BACKEND_CPU) {
-                continue;
-            }
             if (progress_callback) {
                 progress_callback((float) done_size / data_size, progress_callback_user_data);
             }
             LLAMA_ASSERT(lt.ggml_tensor); // unused tensors should have been caught by load_data already
             lt.data = (uint8_t *) lt.ggml_tensor->data;
             load_data_for(lt);
-            lt.ggml_tensor->data = lt.data;
+            switch(lt.ggml_tensor->backend) {
+                case GGML_BACKEND_CPU:
+                    lt.ggml_tensor->data = lt.data;
+                    break;
+#ifdef GGML_USE_CUBLAS
+                case GGML_BACKEND_CUDA:
+                    ggml_cuda_load_data(lt.data, lt.ggml_tensor);
+                    break;
+#endif
+                default:
+                    continue;
+            }
             done_size += lt.size;
             if (use_mmap && lmlock) {
                 lmlock->grow_to(done_size);
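
This hunk folds GPU tensors into the same loading loop as CPU tensors: one pass, one progress counter, and a per-tensor switch on the backend, instead of skipping non-CPU tensors and loading them in a second pass. A toy, self-contained sketch of that dispatch pattern (the enum and struct are invented for the example, not ggml's types):

#include <cstddef>
#include <cstdio>
#include <vector>

// Invented stand-ins for the example.
enum backend_t { BACKEND_CPU, BACKEND_GPU };

struct tensor_t {
    backend_t backend;
    size_t    size;
};

int main() {
    std::vector<tensor_t> tensors = { { BACKEND_CPU, 100 }, { BACKEND_GPU, 200 } };

    size_t data_size = 0;
    for (const tensor_t & t : tensors) {
        data_size += t.size;
    }

    // One pass over all tensors: load, then dispatch on the backend.
    size_t done_size = 0;
    for (const tensor_t & t : tensors) {
        switch (t.backend) {
            case BACKEND_CPU:
                // keep the (mmapped) host pointer as the tensor's data
                break;
            case BACKEND_GPU:
                // hand the same host pointer to the GPU upload routine
                break;
        }
        done_size += t.size;
        std::printf("progress: %.0f%%\n", 100.0 * done_size / data_size);
    }
    return 0;
}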
@@ -1106,28 +1109,7 @@ static void llama_model_load_internal(
 
     ml->load_all_data(progress_callback, progress_callback_user_data, use_mlock ? &lctx.model.mlock_mmap : NULL);
 
-#ifdef GGML_USE_CUBLAS
-    {
-        size_t done_size = 0;
-        size_t data_size = 0;
-        for (llama_load_tensor & lt : ml->tensors_map.tensors) {
-            data_size += lt.size;
-            if (lt.ggml_tensor->backend == GGML_BACKEND_CPU) {
-                done_size += lt.size;
-            }
-        }
-        for (llama_load_tensor & lt : ml->tensors_map.tensors) {
-            if (lt.ggml_tensor->backend != GGML_BACKEND_CUDA) {
-                continue;
-            }
-            if (progress_callback) {
-                progress_callback((float) done_size / data_size, progress_callback_user_data);
-            }
-            ggml_cuda_load_data(fname.c_str(), lt.ggml_tensor, lt.shards.at(0).file_off);
-            done_size += lt.size;
-        }
-    }
-#elif defined(GGML_USE_CLBLAST)
+#ifdef GGML_USE_CLBLAST
     {
         const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
 