@@ -1787,7 +1787,6 @@ static struct ggml_cgraph * llama_build_graph(
1787
1787
// - n_tokens number of tokens
1788
1788
// - n_past: the context size so far
1789
1789
// - n_threads: number of threads to use for inference
1790
- // - pp_threads: number of threads to use for prompt processing
1791
1790
//
1792
1791
static bool llama_eval_internal (
1793
1792
llama_context & lctx,
@@ -1796,7 +1795,6 @@ static bool llama_eval_internal(
                     int   n_tokens,
                     int   n_past,
                     int   n_threads,
-                    int   pp_threads,
              const char * cgraph_fname) {

     LLAMA_ASSERT((!tokens && embd) || (tokens && !embd));
@@ -1840,8 +1838,7 @@ static bool llama_eval_internal(

     // for big prompts, if BLAS is enabled, it is better to use only one thread
     // otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
-    pp_threads = N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas() ? 1 : pp_threads;
-    n_threads = N > 1 ? pp_threads : n_threads;
+    n_threads = N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas() ? 1 : n_threads;

     struct ggml_tensor * res = gf->nodes[gf->n_nodes - 1];
     struct ggml_tensor * embeddings = gf->nodes[gf->n_nodes - 2];
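Note on the hunk above: with pp_threads gone, prompt processing and generation share the single n_threads value, and the BLAS heuristic now overrides it directly. A minimal standalone sketch of the resulting behaviour follows; pick_n_threads() is a hypothetical helper introduced only for illustration, not part of llama.cpp:

    // Sketch only: mirrors the post-patch thread-count heuristic in llama_eval_internal().
    // pick_n_threads() is a hypothetical name, not an actual llama.cpp function.
    static int pick_n_threads(int n_tokens, int n_threads_requested,
                              bool has_blas, bool has_gpublas) {
        // With CPU BLAS and a large batch, extra threads just spin-wait on the
        // BLAS calls, so a single thread performs better.
        if (n_tokens >= 32 && has_blas && !has_gpublas) {
            return 1;
        }
        return n_threads_requested;
    }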
@@ -3487,7 +3484,7 @@ struct llama_context * llama_new_context_with_model(
     if (ggml_mpi_rank(ctx->ctx_mpi) > 0) {
         // Enter a blocking eval loop with dummy input, letting rank=0 drive the process
         const std::vector<llama_token> tmp(ctx->model.hparams.n_ctx, llama_token_bos());
-        while (!llama_eval(ctx, tmp.data(), tmp.size(), 0, 0, 0)) {};
+        while (!llama_eval(ctx, tmp.data(), tmp.size(), 0, 0)) {};
         llama_backend_free();
         exit(1);
     }
@@ -4179,9 +4176,8 @@ int llama_eval(
            const llama_token * tokens,
                          int   n_tokens,
                          int   n_past,
-                         int   n_threads,
-                         int   pp_threads) {
-    if (!llama_eval_internal(*ctx, tokens, nullptr, n_tokens, n_past, n_threads, pp_threads, nullptr)) {
+                         int   n_threads) {
+    if (!llama_eval_internal(*ctx, tokens, nullptr, n_tokens, n_past, n_threads, nullptr)) {
         LLAMA_LOG_ERROR("%s: failed to eval\n", __func__);
         return 1;
     }
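After this change, callers of the public API pass a single thread count. A hedged usage sketch of the updated five-argument llama_eval(), assuming the usual llama.h and <vector> includes, an already created llama_context * named ctx, and illustrative token/thread values:

    // Sketch only: calling the updated llama_eval(ctx, tokens, n_tokens, n_past, n_threads).
    std::vector<llama_token> prompt = { llama_token_bos() };  // illustrative single-token batch
    int n_past    = 0;
    int n_threads = 4;                                        // assumed value

    if (llama_eval(ctx, prompt.data(), (int) prompt.size(), n_past, n_threads) != 0) {
        fprintf(stderr, "llama_eval failed\n");               // non-zero return signals failure
    }
    n_past += (int) prompt.size();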
@@ -4202,9 +4198,8 @@ int llama_eval_embd(
                  const float * embd,
                          int   n_tokens,
                          int   n_past,
-                         int   n_threads,
-                         int   pp_threads) {
-    if (!llama_eval_internal(*ctx, nullptr, embd, n_tokens, n_past, n_threads, pp_threads, nullptr)) {
+                         int   n_threads) {
+    if (!llama_eval_internal(*ctx, nullptr, embd, n_tokens, n_past, n_threads, nullptr)) {
         LLAMA_LOG_ERROR("%s: failed to eval\n", __func__);
         return 1;
     }
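The embedding-input path changes the same way. A hedged sketch of calling the updated llama_eval_embd(), assuming ctx is an existing llama_context * and using a dummy zero embedding for a single token:

    // Sketch only: one n_embd-sized embedding per token, no pp_threads argument.
    const int n_embd   = llama_n_embd(ctx);              // model embedding size
    const int n_tokens = 1;                               // illustrative batch of one
    std::vector<float> embd(n_tokens * n_embd, 0.0f);     // dummy input embedding

    if (llama_eval_embd(ctx, embd.data(), n_tokens, /*n_past=*/0, /*n_threads=*/4) != 0) {
        fprintf(stderr, "llama_eval_embd failed\n");
    }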
@@ -4225,7 +4220,7 @@ int llama_eval_export(struct llama_context * ctx, const char * fname) {

     const std::vector<llama_token> tmp(n_batch, llama_token_bos());

-    if (!llama_eval_internal(*ctx, tmp.data(), nullptr, tmp.size(), n_ctx, 1, 1, fname)) {
+    if (!llama_eval_internal(*ctx, tmp.data(), nullptr, tmp.size(), n_ctx, 1, fname)) {
         LLAMA_LOG_ERROR("%s: failed to eval\n", __func__);
         return 1;
     }