Commit 4ca3961

Merge pull request #32 from cmp-nct/cuda-performance-broadcast
Cuda performance broadcast
2 parents: c4d4d5f + eaf53c4

9 files changed: +2284 -484 lines changed


examples/falcon/falcon_main.cpp

Lines changed: 7 additions & 17 deletions
@@ -319,14 +319,14 @@
 fprintf(stderr, "+------------+-------+-------+-------+-------+-------+-------+-------+-------+------+------+--------+---------+\n");
 fprintf(stderr, "| | %5d | %.3f | %.3f | %.3f | %5d | %.3f | %.3f | %.3f | %.2f | %4d | %.4f | %.5f |\n",
     params.repeat_last_n, params.repeat_penalty, params.presence_penalty, params.frequency_penalty, params.top_k, params.tfs_z, params.top_p, params.typical_p, params.temp, params.mirostat, params.mirostat_eta, params.mirostat_tau);
-fprintf(stderr, "+============+=======+=======+=======+=======+=======+=======+====---+-------+------+------+--------+---------+\n");
-
-fprintf(stderr, "| %10s | %7s | %8s | %6s | %6s | %10s |\n",
-    "Generation", "n_ctx", "n_batch", "n_keep","prompt","seed");
-fprintf(stderr, "+------------+---------+----------+--------+--------+------------+\n");
-fprintf(stderr, "| | %7d | %8d | %6d | %6zu | %10d |\n",
+fprintf(stderr, "+============+=======+=======+=======+=======+=======+=======+-------+-------+------+------+--------+---------+\n");
+
+fprintf(stderr, "| %10s | %5s | %5s | %5s | %5s | %13s |\n",
+    "Generation", "Ctx", "Batch", "Keep","Prmpt","Seed");
+fprintf(stderr, "+------------+-------+-------+-------+-------+---------------+\n");
+fprintf(stderr, "| | %5d | %5d | %5d | %5zu | %13d |\n",
     n_ctx, params.n_batch, params.n_keep, embd_inp.size(),params.seed);
-fprintf(stderr, "+------------+---------+----------+--------+--------+------------+\n");
+fprintf(stderr, "+------------+-------+-------+-------+-------+---------------+\n");
 
 if (n_ctx < (int)(params.n_predict + embd_inp.size())) {
     fprintf(stderr, "%s: Warning: context is smaller than expected generation, will cause delays\n", __func__);
@@ -439,11 +439,6 @@
 embd.erase(embd.begin(), embd.begin() + i);
 }
 }
-// We have buffers from the warmup run that won't all align with a batched run
-#if defined(GGML_USE_CUBLAS)
-if (params.n_batch > 1 && embd.size() > 1)
-    ggml_cuda_pool_free_all(-1);
-#endif
 // evaluate tokens in batches
 // embd is typically prepared beforehand to fit within a batch, but not always
 for (int i = 0; i < (int) embd.size(); i += params.n_batch) {
@@ -459,11 +454,6 @@
 }
 n_past += n_eval;
 }
-#if defined(GGML_USE_CUBLAS)
-// frees unused allocations, those during batch processing are of different size than single token eval
-if (params.n_batch > 1 && embd.size() > 1)
-    ggml_cuda_pool_free_all(-1);
-#endif
 if (embd.size() > 0 && !path_session.empty()) {
 session_tokens.insert(session_tokens.end(), embd.begin(), embd.end());
 n_session_consumed = session_tokens.size();
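
The context lines above show the token-evaluation loop in falcon_main.cpp: embd is consumed in chunks of at most params.n_batch tokens and n_past advances by the number of tokens actually evaluated. The following is a minimal, self-contained sketch of that chunking pattern (not code from this commit; eval_tokens() is a placeholder for the model's eval call):

    #include <algorithm>
    #include <cstdio>
    #include <vector>

    // Placeholder for the real model eval; it only reports what would be evaluated.
    static bool eval_tokens(const int * tokens, int n_eval, int n_past) {
        std::printf("eval %d tokens at n_past=%d (first id %d)\n", n_eval, n_past, tokens[0]);
        return true;
    }

    int main() {
        std::vector<int> embd(70, 42);   // pretend 70 prompt token ids
        const int n_batch = 32;
        int n_past = 0;
        for (int i = 0; i < (int) embd.size(); i += n_batch) {
            // the last chunk may be shorter than n_batch
            int n_eval = std::min((int) embd.size() - i, n_batch);
            if (!eval_tokens(&embd[i], n_eval, n_past)) return 1;
            n_past += n_eval;  // context position advances by the evaluated chunk
        }
        // with 70 tokens and n_batch = 32 the chunks are 32, 32 and 6
        return 0;
    }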

examples/falcon_common.cpp

Lines changed: 25 additions & 5 deletions
@@ -58,7 +58,22 @@ int32_t get_num_physical_cores() {
 return num_physical_cores;
 }
 #elif defined(_WIN32)
-//TODO: Implement
+int logical_cores;
+SYSTEM_INFO sysinfo;
+GetSystemInfo(&sysinfo);
+logical_cores = sysinfo.dwNumberOfProcessors;
+
+DWORD_PTR process_affinity_mask;
+DWORD_PTR system_affinity_mask;
+GetProcessAffinityMask(GetCurrentProcess(), &process_affinity_mask, &system_affinity_mask);
+
+int physical_cores = 0;
+for (int i = 0; i < sizeof(DWORD_PTR) * 8; i++) {
+    if (process_affinity_mask & ((DWORD_PTR)1 << i)) {
+        physical_cores++;
+    }
+}
+return physical_cores;
 #endif
 unsigned int n_threads = std::thread::hardware_concurrency();
 return n_threads > 0 ? (n_threads <= 4 ? n_threads : n_threads / 2) : 4;
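
A note on the hunk above (commentary, not part of the commit): the new _WIN32 branch counts the bits set in the process affinity mask, which gives the number of logical processors the process may run on rather than the number of physical cores. A sketch of counting actual physical cores on Windows with the documented GetLogicalProcessorInformation API could look like this:

    #include <windows.h>
    #include <vector>

    // Count physical cores by enumerating RelationProcessorCore entries.
    static int count_physical_cores_win32() {
        DWORD len = 0;
        GetLogicalProcessorInformation(nullptr, &len);  // query required buffer size
        if (GetLastError() != ERROR_INSUFFICIENT_BUFFER) return 0;
        std::vector<SYSTEM_LOGICAL_PROCESSOR_INFORMATION> info(
            len / sizeof(SYSTEM_LOGICAL_PROCESSOR_INFORMATION));
        if (!GetLogicalProcessorInformation(info.data(), &len)) return 0;
        int cores = 0;
        for (const auto & entry : info) {
            if (entry.Relationship == RelationProcessorCore) cores++;  // one entry per core
        }
        return cores;
    }
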
@@ -98,6 +113,11 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
 #if defined(GGML_USE_CUBLAS)
 ggml_cuda_set_max_gpus(LLAMA_MAX_DEVICES); // default
 #endif
+params.n_threads = get_num_physical_cores();
+// until thread scheduling is improved, these numbers are around the optimal (for huge batch processing increase -t manually)
+if (params.n_threads > 8) params.n_threads = 4;
+if (params.n_threads > 4) params.n_threads = 2;
+
 
 for (int i = 1; i < argc; i++) {
 arg = argv[i];
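
Because the two ifs added above are evaluated in order, the defaults they produce are not monotonic: a machine reporting more than 8 cores is capped to 4 threads (and 4 is not > 4, so it stays there), while 5-8 cores fall through to 2 threads. A small sketch of the resulting defaults, using a hypothetical default_threads() helper that mirrors the committed logic:

    #include <cstdio>

    // Mirrors the heuristic added in gpt_params_parse().
    static int default_threads(int physical_cores) {
        int n_threads = physical_cores;
        if (n_threads > 8) n_threads = 4;  // many-core machines: 4 threads
        if (n_threads > 4) n_threads = 2;  // 5..8 cores: 2 threads
        return n_threads;
    }

    int main() {
        for (int cores : {2, 4, 6, 8, 12, 16}) {
            std::printf("%2d cores -> %d default threads\n", cores, default_threads(cores));
        }
        // prints 2->2, 4->4, 6->2, 8->2, 12->4, 16->4
        return 0;
    }
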
@@ -245,7 +265,7 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
 break;
 }
 params.n_batch = std::stoi(argv[i]);
-params.n_batch = std::min(512, params.n_batch);
+params.n_batch = std::min(1024+128, params.n_batch); // appears to work fine with scratch buffer, keep in eye
 } else if (arg == "--keep") {
 if (++i >= argc) {
 invalid_param = true;
@@ -331,7 +351,7 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
 }
 #ifdef GGML_USE_CUBLAS
 params.mb_reserve_gpu_main = std::stoi(argv[i]);
-ggml_cuda_set_vram_reserved((size_t)params.mb_reserve_gpu_main * 1024*1024);
+ggml_cuda_set_vram_reserved(params.mb_reserve_gpu_main * 1024*1024);
 #else
 fprintf(stderr, "warning: falcon.cpp was compiled without cuBLAS. VRAM not available.\n");
 #endif
@@ -383,7 +403,7 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
 params.mem_test = true;
 } else if (arg == "--export") {
 params.export_cgraph = true;
-} else if (arg == "--debug-timings" || arg == "-dt") {
+} else if (arg == "--debug-timings" || arg == "--display-timings" || arg == "-dt") {
 if (++i >= argc) {
 params.debug_timings = 1;
 } else
@@ -547,7 +567,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
 fprintf(stderr, " -mg i, --main-gpu i the GPU to use for scratch and small tensors (0 = first)\n" );
 fprintf(stderr, " --override-max-gpu N\n");
 fprintf(stderr, " limits the number of GPUs visible (allows to disable multi/single GPU processing)\n");
-fprintf(stderr, " --gpu-reserve-mb-main override reserved VRAM MB for main GPU (defaults to first GPU)\n");
+fprintf(stderr, " --gpu-reserve-mb-main override reserved total VRAM MB (can be negative if your driver supports swapping into RAM) \n");
 //fprintf(stderr, " --gpu_reserve_mb_other override reserved VRAM MB for other GPUs (for multi GPU systems)\n");
 #endif
 fprintf(stderr, " --mtest compute maximum memory usage\n");

examples/falcon_common.h

Lines changed: 2 additions & 2 deletions
@@ -23,15 +23,15 @@ int32_t get_num_physical_cores();
 
 struct gpt_params {
 int32_t seed = -1; // RNG seed
-int32_t n_threads = get_num_physical_cores();
+int32_t n_threads = 1;
 int32_t n_predict = -1; // new tokens to predict
 int32_t n_ctx = 512; // context size
 int32_t n_batch = 1; // batch size for prompt processing (must be >=32 to use BLAS)
 int32_t n_keep = 0; // number of tokens to keep from initial prompt
 int32_t n_gpu_layers = 200; // number of layers to store in VRAM
 int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors
 float tensor_split[LLAMA_MAX_DEVICES] = {0}; // how split tensors should be distributed across GPUs
-int32_t n_max_gpu = 16; // maximum number of GPUs to use
+int n_max_gpu = 16; // maximum number of GPUs to use
 int32_t mb_reserve_gpu_main = false; // override reserved megabytes of VRAM for the main GPU
 // int mb_reserve_gpu_other = false; // override reserved megabytes of VRAM for secondary GPUs
 