
Commit 2dd6dee

cuBLAS: use host pinned memory

1 parent: d3fd04e

File tree: 6 files changed, +52 -9 lines

Makefile

Lines changed: 3 additions & 2 deletions

@@ -106,6 +106,7 @@ ifdef LLAMA_OPENBLAS
 endif
 ifdef LLAMA_CUBLAS
 	CFLAGS   += -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/x86_64-linux/include
+	CXXFLAGS += -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/x86_64-linux/include
 	LDFLAGS  += -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/opt/cuda/lib64 -L$(CUDA_PATH)/targets/x86_64-linux/lib
 	OBJS     += ggml-cuda.o
 	NVCC     = nvcc
@@ -164,10 +165,10 @@ $(info )
 # Build library
 #

-ggml.o: ggml.c ggml.h
+ggml.o: ggml.c ggml.h ggml-cuda.h
 	$(CC)  $(CFLAGS)   -c $< -o $@

-llama.o: llama.cpp ggml.h llama.h llama_util.h
+llama.o: llama.cpp ggml.h ggml-cuda.h llama.h llama_util.h
 	$(CXX) $(CXXFLAGS) -c $< -o $@

 common.o: examples/common.cpp examples/common.h
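Note on the added CXXFLAGS line: previously only the C compiler received -DGGML_USE_CUBLAS, so C++ translation units (llama.cpp, and llama_util.h below) could not see the macro. A minimal sketch of the pattern this enables; alloc_buf is a hypothetical helper for illustration, not from the commit:

// sketch: a C++ translation unit keying off the macro the new CXXFLAGS line defines
#include <cstddef>
#ifdef GGML_USE_CUBLAS
#include "ggml-cuda.h"
// pinned, page-locked allocation (see ggml-cuda.cu below)
static void * alloc_buf(size_t n) { return ggml_cuda_host_malloc(n); }
#else
#include <cstdlib>
// ordinary pageable allocation
static void * alloc_buf(size_t n) { return malloc(n); }
#endif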

ggml-cuda.cu

Lines changed: 10 additions & 0 deletions

@@ -353,3 +353,13 @@ cudaError_t ggml_cuda_h2d_tensor_2d(void * dst, const struct ggml_tensor * src,
         return cudaSuccess;
     }
 }
+
+void * ggml_cuda_host_malloc(size_t size) {
+    void * ptr;
+    CUDA_CHECK(cudaMallocHost((void **) &ptr, size));
+    return ptr;
+}
+
+void ggml_cuda_host_free(void * ptr) {
+    CUDA_CHECK(cudaFreeHost(ptr));
+}
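Background on the new helpers: cudaMallocHost allocates page-locked ("pinned") host memory that the driver can DMA to the device directly, which is what makes cudaMemcpyAsync genuinely asynchronous; a copy from ordinary pageable memory is staged through an internal buffer and effectively synchronizes. A self-contained sketch of the pattern, with illustrative buffer size and stream names not taken from the commit:

// standalone sketch (compile with nvcc): pinned host buffers feeding
// a truly asynchronous host-to-device copy
#include <cuda_runtime.h>
#include <cstdio>
#include <cstdlib>

#define CUDA_CHECK(call) do {                                        \
        cudaError_t err_ = (call);                                   \
        if (err_ != cudaSuccess) {                                   \
            fprintf(stderr, "CUDA error: %s at %s:%d\n",             \
                    cudaGetErrorString(err_), __FILE__, __LINE__);   \
            exit(1);                                                 \
        }                                                            \
    } while (0)

int main() {
    const size_t size = 64 * 1024 * 1024; // 64 MiB, arbitrary demo size

    // pinned host buffer: eligible for direct DMA, so the copy below
    // can overlap with work on other streams
    float * host = nullptr;
    CUDA_CHECK(cudaMallocHost((void **) &host, size));

    float * dev = nullptr;
    CUDA_CHECK(cudaMalloc((void **) &dev, size));

    cudaStream_t stream;
    CUDA_CHECK(cudaStreamCreate(&stream));

    // with a pageable (malloc'd) source this call would silently fall
    // back to a staged, synchronous copy
    CUDA_CHECK(cudaMemcpyAsync(dev, host, size, cudaMemcpyHostToDevice, stream));
    CUDA_CHECK(cudaStreamSynchronize(stream));

    CUDA_CHECK(cudaFree(dev));
    CUDA_CHECK(cudaFreeHost(host));
    CUDA_CHECK(cudaStreamDestroy(stream));
    return 0;
}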

ggml-cuda.h

Lines changed: 3 additions & 0 deletions

@@ -31,6 +31,9 @@ extern cudaStream_t g_cudaStream2;
 extern cudaEvent_t g_cudaEvent;

 void ggml_init_cublas(void);
+void * ggml_cuda_host_malloc(size_t size);
+void ggml_cuda_host_free(void * ptr);
+
 void * ggml_cuda_pool_malloc(size_t size, size_t * actual_size);
 void ggml_cuda_pool_free(void * ptr, size_t size);


ggml.c

Lines changed: 7 additions & 4 deletions

@@ -8235,8 +8235,6 @@ static void ggml_compute_forward_mul_mat_f16_f32(
     }

 #if defined(GGML_USE_CUBLAS)
-        ggml_fp16_t * const wdata = params->wdata;
-
         const float alpha = 1.0f;
         const float beta = 0.0f;
         const int x_ne = ne01 * ne00;
@@ -8254,6 +8252,7 @@ static void ggml_compute_forward_mul_mat_f16_f32(
         for (int64_t i02 = 0; i02 < ne02; i02++) {
 #if defined(GGML_USE_CUBLAS)
             // with cuBlAS, instead of converting src0 to fp32, we convert src1 to fp16
+            ggml_fp16_t * const wdata = (ggml_fp16_t *) params->wdata + (ne11 * ne10) * (i03 * ne02 + i02);
             {
                 size_t id = 0;
                 for (int64_t i01 = 0; i01 < ne11; ++i01) {
@@ -8540,7 +8539,6 @@ static void ggml_compute_forward_mul_mat_q_f32(
             const float * x = wdata;
 #endif

-
 #if defined(GGML_USE_CUBLAS)
             // copy data to device
             CUDA_CHECK(ggml_cuda_h2d_tensor_2d(d_Y, src1, i03, i02, g_cudaStream));
@@ -11571,7 +11569,7 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
                 if (ggml_compute_forward_mul_mat_use_blas(node->src0, node->src1, node)) {
                     node->n_tasks = 1; // TODO: this actually is doing nothing
                                        //       the threads are still spinning
-                    cur = GGML_TYPE_SIZE[GGML_TYPE_F32]*(node->src0->ne[0]*node->src0->ne[1]);
+                    cur = GGML_TYPE_SIZE[GGML_TYPE_F32]*MAX(ggml_nelements(node->src1), ggml_nelements(node->src0));
                     //printf("src0: ne0 = %d, ne1 = %d, ne = %d\n", node->src0->ne[0], node->src0->ne[1], node->src0->ne[0]*node->src0->ne[1]);
                     //printf("src1: ne0 = %d, ne1 = %d, ne = %d\n", node->src1->ne[0], node->src1->ne[1], node->src1->ne[0]*node->src1->ne[1]);
                     //printf("cur = %zu\n", cur);
@@ -11583,6 +11581,11 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
 #endif
                 } else if (node->src0->type == GGML_TYPE_F32 && node->src1->type == GGML_TYPE_F32) {
                     cur = 0;
+#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS)
+                    if (ggml_compute_forward_mul_mat_use_blas(node->src0, node->src1, node)) {
+                        node->n_tasks = 1;
+                    }
+#endif
                 } else if (ggml_is_quantized(node->src0->type) && node->src1->type == GGML_TYPE_F32) {
 #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
                     if (ggml_compute_forward_mul_mat_use_blas(node->src0, node->src1, node)) {
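On the work-buffer sizing hunk: the buffer was previously sized for one f32 slice of src0 only, but the cuBLAS f16 path now stores a converted copy of src1 in params->wdata for every (i02, i03) slice, so the buffer is sized by whichever of src0 and src1 has more elements. A small arithmetic sketch with hypothetical shapes showing when the old formula under-allocates:

// sketch of the sizing change, with hypothetical ggml-style shapes (ne[0..3]);
// not code from the commit
#include <algorithm>
#include <cstddef>
#include <cstdio>

int main() {
    const size_t src0_ne[4] = {4096, 4096, 1, 1}; // weight matrix
    const size_t src1_ne[4] = {4096, 8192, 1, 1}; // large activation batch
    const size_t sizeof_f32 = 4;

    const size_t n0 = src0_ne[0] * src0_ne[1] * src0_ne[2] * src0_ne[3];
    const size_t n1 = src1_ne[0] * src1_ne[1] * src1_ne[2] * src1_ne[3];

    // pre-commit: covers only a single f32 slice of src0
    const size_t cur_old = sizeof_f32 * src0_ne[0] * src0_ne[1];
    // post-commit: also covers the fp16 copy of src1 held in wdata
    const size_t cur_new = sizeof_f32 * std::max(n0, n1);

    printf("old: %zu bytes, new: %zu bytes\n", cur_old, cur_new);
    return 0; // here src1 outnumbers src0's slice, so cur_old under-allocates
}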

llama.cpp

Lines changed: 3 additions & 3 deletions

@@ -167,7 +167,7 @@ struct llama_model {
     struct llama_kv_cache kv_self;

     // the model memory buffer
-    llama_buffer buf;
+    llama_ctx_buffer buf;

     // model memory mapped file
     std::unique_ptr<llama_mmap> mapping;
@@ -228,8 +228,8 @@ struct llama_context {

     // memory buffers used to evaluate the model
     // TODO: move in llama_state
-    llama_buffer buf_compute;
-    llama_buffer buf_scratch[LLAMA_MAX_SCRATCH_BUFFERS];
+    llama_ctx_buffer buf_compute;
+    llama_ctx_buffer buf_scratch[LLAMA_MAX_SCRATCH_BUFFERS];

     int buf_last = 0;
     size_t buf_max_size[LLAMA_MAX_SCRATCH_BUFFERS] = { 0 };

llama_util.h

Lines changed: 26 additions & 0 deletions

@@ -405,4 +405,30 @@ struct llama_buffer {
         delete[] addr;
     }
 };
+
+#ifdef GGML_USE_CUBLAS
+#include "ggml-cuda.h"
+struct llama_ctx_buffer {
+    uint8_t * addr = NULL;
+    size_t size = 0;
+
+    void resize(size_t size) {
+        if (addr) {
+            ggml_cuda_host_free(addr);
+        }
+        addr = (uint8_t *) ggml_cuda_host_malloc(size);
+        this->size = size;
+    }
+
+    ~llama_ctx_buffer() {
+        if (addr) {
+            ggml_cuda_host_free(addr);
+        }
+    }
+};
+#else
+typedef llama_buffer llama_ctx_buffer;
+#endif
+
+
 #endif
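The upshot: llama_ctx_buffer keeps llama_buffer's addr/size/resize interface, so the llama.cpp call sites above change only the type name, while cuBLAS builds transparently get pinned storage for the model, compute, and scratch buffers. A hypothetical usage sketch, not a call site from the commit; it assumes the repository headers are on the include path:

// hypothetical usage of the new buffer type; under GGML_USE_CUBLAS the
// storage comes from cudaMallocHost, otherwise from plain new[]
#include "llama_util.h"

int main() {
    llama_ctx_buffer buf;
    buf.resize(16u * 1024 * 1024); // 16 MiB, pinned when built with cuBLAS
    buf.addr[0] = 0;               // behaves like ordinary host memory
    return 0;                      // destructor frees with the matching API
}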
