Commit e9b7a5c

llama : use n_threads_batch only when n_tokens >= 32
ggml-ci
1 parent f815fe4

1 file changed (+2, -12 lines)

llama.cpp

Lines changed: 2 additions & 12 deletions
@@ -5433,7 +5433,7 @@ static int llama_decode_internal(
 
     GGML_ASSERT(n_tokens <= n_batch);
 
-    int n_threads = n_tokens == 1 ? cparams.n_threads : cparams.n_threads_batch;
+    int n_threads = n_tokens < 32 ? cparams.n_threads : cparams.n_threads_batch;
     GGML_ASSERT((!batch.token && batch.embd) || (batch.token && !batch.embd)); // NOLINT
 
     const int64_t t_start_us = ggml_time_us();
@@ -5550,18 +5550,8 @@ static int llama_decode_internal(
         n_threads = std::min(4, n_threads);
     }
 
-    // If all tensors can be run on the GPU then using more than 1 thread is detrimental.
-    const bool full_offload_supported =
-        model.arch == LLM_ARCH_LLAMA    ||
-        model.arch == LLM_ARCH_BAICHUAN ||
-        model.arch == LLM_ARCH_FALCON   ||
-        model.arch == LLM_ARCH_REFACT   ||
-        model.arch == LLM_ARCH_MPT      ||
-        model.arch == LLM_ARCH_STARCODER ||
-        model.arch == LLM_ARCH_STABLELM;
-
     const bool fully_offloaded = model.n_gpu_layers >= (int) hparams.n_layer + 3;
-    if (ggml_cpu_has_cublas() && full_offload_supported && fully_offloaded) {
+    if (ggml_cpu_has_cublas() && fully_offloaded) {
         n_threads = 1;
     }
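For context, here is a minimal sketch of how an application ends up exercising both thread counts that llama_decode_internal now chooses between. It assumes the llama.h C API from roughly this period (llama_backend_init, llama_load_model_from_file, llama_new_context_with_model); the model path "model.gguf" and the thread values are placeholder assumptions, not part of this commit.

// Sketch only: configuring the two thread counts this commit selects between.
// API names per llama.h of this era; adjust for newer versions.
#include "llama.h"

int main(void) {
    llama_backend_init(false /* numa */);

    struct llama_model_params mparams = llama_model_default_params();
    struct llama_model * model = llama_load_model_from_file("model.gguf", mparams);
    if (model == NULL) {
        return 1;
    }

    struct llama_context_params cparams = llama_context_default_params();
    cparams.n_threads       = 8;  // used when a batch has fewer than 32 tokens
    cparams.n_threads_batch = 16; // used when a batch has 32 tokens or more

    struct llama_context * ctx = llama_new_context_with_model(model, cparams);

    // ... build batches and call llama_decode(ctx, batch) here: single-token
    // generation steps run on n_threads, while prompt-processing batches of
    // 32 or more tokens run on n_threads_batch ...

    llama_free(ctx);
    llama_free_model(model);
    llama_backend_free();
    return 0;
}

Note that the second hunk also makes the single-thread shortcut for fully offloaded cuBLAS runs unconditional: the per-architecture full_offload_supported whitelist is removed, so any model with all layers offloaded to the GPU now decodes with n_threads = 1.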