Commit ad7951c

CUDA scratch proportional to batch size
1 parent dcbb182 commit ad7951c

File tree

  examples/common.cpp
  ggml-cuda.cu
  ggml-cuda.h
  llama.cpp
  llama.h

5 files changed: +34 -34 lines changed

examples/common.cpp

Lines changed: 1 addition & 0 deletions
@@ -510,6 +510,7 @@ struct llama_context * llama_init_from_gpt_params(const gpt_params & params) {
     auto lparams = llama_context_default_params();
 
     lparams.n_ctx = params.n_ctx;
+    lparams.n_batch = params.n_batch;
     lparams.n_gpu_layers = params.n_gpu_layers;
     memcpy(lparams.tensor_split, params.tensor_split, LLAMA_MAX_DEVICES*sizeof(float));
     lparams.seed = params.seed;
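
The hunk above only forwards the batch size from the example-level gpt_params into the library-level context params. A minimal caller-side sketch, assuming params.n_batch has already been filled in by the example's argument parsing (the exact command-line flag is not part of this diff):

    // Usage sketch (hypothetical values): n_batch flows into the context params
    // and, with cuBLAS enabled, determines the size of the CUDA scratch buffer.
    gpt_params params;
    params.n_batch = 256; // e.g. a smaller batch to shrink the scratch allocation
    struct llama_context * ctx = llama_init_from_gpt_params(params);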

ggml-cuda.cu

Lines changed: 15 additions & 22 deletions
@@ -534,12 +534,12 @@ static void ggml_cuda_pool_free(void * ptr, size_t size) {
     CUDA_CHECK(cudaFree(ptr));
 }
 
-#define GGML_CUDA_MAX_SCRATCH_BUFFERS 16
-#define GGML_CUDA_SCRATCH_SIZE 536870912 // 512 MB
-//#define GGML_CUDA_SCRATCH_SIZE 1073741824 // 1 GB
-//#define GGML_CUDA_SCRATCH_SIZE 4294967296 // 4 GB
-static void * g_scratch_buffers[GGML_CUDA_MAX_DEVICES][GGML_CUDA_MAX_SCRATCH_BUFFERS] = {nullptr};
-static int g_scratch_index = 0;
+
+#define GGML_CUDA_SCRATCH_SIZE_PER_BATCH 1048576 // 1 MB
+//#define GGML_CUDA_SCRATCH_SIZE_PER_BATCH 2097152 // 2 MB
+
+static void * g_scratch_buffer = nullptr;
+static int g_n_batch = 512;
 static size_t g_scratch_offset = 0;
 
 #define GGML_CUDA_MAX_STREAMS 8 // Set this to 1 for reproducible matrix multiplication.
@@ -1288,8 +1288,9 @@ void ggml_cuda_free_data(struct ggml_tensor * tensor) {
 
 void ggml_cuda_assign_buffers(struct ggml_tensor * tensor) {
     const size_t size = ggml_nbytes(tensor);
-    GGML_ASSERT(size <= GGML_CUDA_SCRATCH_SIZE);
-    if (g_scratch_offset + size > GGML_CUDA_SCRATCH_SIZE) {
+    const size_t scratch_size = g_n_batch * GGML_CUDA_SCRATCH_SIZE_PER_BATCH;
+    GGML_ASSERT(size <= scratch_size);
+    if (g_scratch_offset + size > scratch_size) {
         g_scratch_offset = 0;
     }
 
@@ -1304,10 +1305,10 @@ void ggml_cuda_assign_buffers(struct ggml_tensor * tensor) {
         extra->data_device[g_main_device] = src0_extra->data_device;
         GGML_ASSERT(false);
     } else {
-        char * data = (char *) g_scratch_buffers[g_main_device][g_scratch_index];
+        char * data = (char *) g_scratch_buffer;
         if (data == nullptr) {
-            CUDA_CHECK(cudaMalloc(&data, GGML_CUDA_SCRATCH_SIZE));
-            g_scratch_buffers[g_main_device][g_scratch_index] = data;
+            CUDA_CHECK(cudaMalloc(&data, scratch_size));
+            g_scratch_buffer = data;
         }
         extra->data_device[g_main_device] = data + g_scratch_offset;
     }
@@ -1317,20 +1318,12 @@
     // fprintf(stderr, "%s: scratch %d, %p - %p\n",
     //     tensor->name, g_scratch_index, data + g_scratch_offset, data + g_scratch_offset + size);
 
-    GGML_ASSERT(g_scratch_offset <= GGML_CUDA_SCRATCH_SIZE);
+    GGML_ASSERT(g_scratch_offset <= scratch_size);
     tensor->extra = extra;
 }
 
-void ggml_cuda_set_scratch(int i) {
-    if (i == -1) {
-        return;
-    }
-#if false
-    fprintf(stderr, "\n%s: switched scratch %d -> %d, old scratch used %.2f MB\n",
-        __func__, g_scratch_index, i, g_scratch_offset/1024.0f/1024.0f);
-#endif
-    g_scratch_index = i;
-    g_scratch_offset = 0;
+void ggml_cuda_set_n_batch(int n_batch) {
+    g_n_batch = n_batch;
 }
 
 bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor){
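
With these definitions the scratch buffer is sized per batch instead of being a fixed 512 MB. A quick arithmetic check of the constants (a sketch, not code from the commit): at the default g_n_batch of 512 the new scheme reproduces the old fixed size exactly, while smaller batch sizes now allocate proportionally less VRAM.

    // Arithmetic check (sketch): default g_n_batch = 512,
    // GGML_CUDA_SCRATCH_SIZE_PER_BATCH = 1048576 bytes (1 MB),
    // so 512 * 1048576 = 536870912 bytes, i.e. the old fixed GGML_CUDA_SCRATCH_SIZE (512 MB).
    const size_t scratch_size = g_n_batch * GGML_CUDA_SCRATCH_SIZE_PER_BATCH;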

ggml-cuda.h

Lines changed: 6 additions & 6 deletions
@@ -13,7 +13,7 @@ struct ggml_tensor_extra_gpu {
 };
 
 void ggml_init_cublas(void);
-void ggml_cuda_set_tensor_split(const float * tensor_split);
+void ggml_cuda_set_tensor_split(const float * tensor_split);
 
 void ggml_cuda_mul(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
 bool ggml_cuda_can_mul_mat(const struct ggml_tensor * src0, const struct ggml_tensor * src1, struct ggml_tensor * dst);
@@ -24,11 +24,11 @@ void ggml_cuda_mul_mat(const struct ggml_tensor * src0, const struct ggml_tens
 void * ggml_cuda_host_malloc(size_t size);
 void ggml_cuda_host_free(void * ptr);
 
-void ggml_cuda_load_data(const char * fname, struct ggml_tensor * tensors, size_t offset);
-void ggml_cuda_free_data(struct ggml_tensor * tensor);
-void ggml_cuda_assign_buffers(struct ggml_tensor * tensor);
-void ggml_cuda_set_scratch(int i);
-bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor);
+void ggml_cuda_load_data(const char * fname, struct ggml_tensor * tensors, size_t offset);
+void ggml_cuda_free_data(struct ggml_tensor * tensor);
+void ggml_cuda_assign_buffers(struct ggml_tensor * tensor);
+void ggml_cuda_set_n_batch(int n_batch);
+bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor);
 
 #ifdef __cplusplus
 }

llama.cpp

Lines changed: 8 additions & 3 deletions
@@ -849,6 +849,7 @@ static bool kv_cache_init(
 struct llama_context_params llama_context_default_params() {
     struct llama_context_params result = {
         /*.n_ctx =*/ 512,
+        /*.n_batch =*/ 512,
         /*.gpu_layers =*/ 0,
         /*.tensor_split =*/ {0},
         /*.seed =*/ -1,
@@ -934,6 +935,7 @@ static void llama_model_load_internal(
         const std::string & fname,
         llama_context & lctx,
         int n_ctx,
+        int n_batch,
         int n_gpu_layers,
         const float * tensor_split,
         ggml_type memory_type,
@@ -1142,6 +1144,7 @@
 
 #if defined(GGML_USE_CUBLAS)
     {
+        ggml_cuda_set_n_batch(n_batch);
         ggml_cuda_set_tensor_split(tensor_split);
 
         size_t done_size = 0;
@@ -1186,6 +1189,7 @@
         }
     }
 #else
+    (void) n_batch;
     (void) tensor_split;
 #endif
 
@@ -1204,6 +1208,7 @@ static bool llama_model_load(
         const std::string & fname,
         llama_context & lctx,
         int n_ctx,
+        int n_batch,
         int n_gpu_layers,
         float * tensor_split,
         ggml_type memory_type,
@@ -1213,7 +1218,7 @@ static bool llama_model_load(
         llama_progress_callback progress_callback,
         void *progress_callback_user_data) {
     try {
-        llama_model_load_internal(fname, lctx, n_ctx, n_gpu_layers, tensor_split, memory_type, use_mmap,
+        llama_model_load_internal(fname, lctx, n_ctx, n_batch, n_gpu_layers, tensor_split, memory_type, use_mmap,
                                   use_mlock, vocab_only, progress_callback, progress_callback_user_data);
         return true;
     } catch (const std::string & err) {
@@ -2375,8 +2380,8 @@ struct llama_context * llama_init_from_file(
 
     ggml_type memory_type = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32;
 
-    if (!llama_model_load(path_model, *ctx, params.n_ctx, params.n_gpu_layers, params.tensor_split,
-                          memory_type, params.use_mmap, params.use_mlock, params.vocab_only,
+    if (!llama_model_load(path_model, *ctx, params.n_ctx, params.n_batch, params.n_gpu_layers,
+                          params.tensor_split, memory_type, params.use_mmap, params.use_mlock, params.vocab_only,
                           params.progress_callback, params.progress_callback_user_data)) {
         fprintf(stderr, "%s: failed to load model\n", __func__);
         llama_free(ctx);
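
The new n_batch value is threaded from llama_init_from_file through llama_model_load and llama_model_load_internal down to ggml_cuda_set_n_batch. A condensed sketch of the guard pattern used above; the (void) cast in the non-cuBLAS branch exists only to silence unused-parameter warnings when CUDA support is compiled out:

    #if defined(GGML_USE_CUBLAS)
        ggml_cuda_set_n_batch(n_batch);           // scratch buffer is sized from this
        ggml_cuda_set_tensor_split(tensor_split);
    #else
        (void) n_batch;                           // unused without cuBLAS
        (void) tensor_split;
    #endif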

llama.h

Lines changed: 4 additions & 3 deletions
@@ -72,10 +72,11 @@ extern "C" {
     typedef void (*llama_progress_callback)(float progress, void *ctx);
 
     struct llama_context_params {
-        int n_ctx; // text context
-        int n_gpu_layers; // number of layers to store in VRAM
+        int n_ctx; // text context
+        int n_batch; // prompt processing batch size
+        int n_gpu_layers; // number of layers to store in VRAM
         float tensor_split[LLAMA_MAX_DEVICES]; // how to split layers across multiple GPUs
-        int seed; // RNG seed, -1 for random
+        int seed; // RNG seed, -1 for random
 
         bool f16_kv; // use fp16 for KV cache
         bool logits_all; // the llama_eval() call computes all logits, not just the last one
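
For API users the new field means the batch size chosen at context creation also bounds the CUDA scratch allocation (the -/+ pairs above that look identical appear to differ only in whitespace alignment, which the rendered diff does not preserve). A minimal usage sketch against the C API declared in this commit; the model path and parameter values are illustrative only:

    // Sketch: choose a batch size at context creation; with cuBLAS enabled the
    // CUDA scratch buffer is allocated as n_batch * 1 MB instead of a fixed 512 MB.
    struct llama_context_params cparams = llama_context_default_params();
    cparams.n_batch      = 256;   // prompt processing batch size
    cparams.n_gpu_layers = 32;    // number of layers to store in VRAM
    struct llama_context * ctx = llama_init_from_file("models/7B/ggml-model.bin", cparams);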
