@@ -319,14 +319,14 @@ fprintf(stderr, "| %10s | %5s | %4s | %4s | %4s | %4s | %4s | %4s | %4s | %4s |
319
319
fprintf (stderr, " +------------+-------+-------+-------+-------+-------+-------+-------+-------+------+------+--------+---------+\n " );
320
320
fprintf (stderr, " | | %5d | %.3f | %.3f | %.3f | %5d | %.3f | %.3f | %.3f | %.2f | %4d | %.4f | %.5f |\n " ,
321
321
params.repeat_last_n , params.repeat_penalty , params.presence_penalty , params.frequency_penalty , params.top_k , params.tfs_z , params.top_p , params.typical_p , params.temp , params.mirostat , params.mirostat_eta , params.mirostat_tau );
322
- fprintf (stderr, " +============+=======+=======+=======+=======+=======+=======+==== ---+-------+------+------+--------+---------+\n " );
323
-
324
- fprintf (stderr, " | %10s | %7s | %8s | %6s | %6s | %10s |\n " ,
325
- " Generation" , " n_ctx " , " n_batch " , " n_keep " ," prompt " ," seed " );
326
- fprintf (stderr, " +------------+--------- +---------- +-------- +-------- +------------+\n " );
327
- fprintf (stderr, " | | %7d | %8d | %6d | %6zu | %10d |\n " ,
322
+ fprintf (stderr, " +============+=======+=======+=======+=======+=======+=======+---- ---+-------+------+------+--------+---------+\n " );
323
+
324
+ fprintf (stderr, " | %10s | %5s | %5s | %5s | %5s | %13s |\n " ,
325
+ " Generation" , " Ctx " , " Batch " , " Keep " ," Prmpt " ," Seed " );
326
+ fprintf (stderr, " +------------+-------+-------+-------+-------+--------------- +\n " );
327
+ fprintf (stderr, " | | %5d | %5d | %5d | %5zu | %13d |\n " ,
328
328
n_ctx, params.n_batch , params.n_keep , embd_inp.size (),params.seed );
329
- fprintf (stderr, " +------------+--------- +---------- +-------- +-------- +------------+\n " );
329
+ fprintf (stderr, " +------------+-------+-------+-------+-------+--------------- +\n " );
330
330
331
331
if (n_ctx < (int )(params.n_predict + embd_inp.size ())) {
332
332
fprintf (stderr, " %s: Warning: context is smaller than expected generation, will cause delays\n " , __func__);
@@ -439,11 +439,6 @@ fprintf(stderr, "+------------+---------+----------+--------+--------+----------
439
439
embd.erase (embd.begin (), embd.begin () + i);
440
440
}
441
441
}
442
- // We have buffers from the warmup run that won't all align with a batched run
443
- #if defined(GGML_USE_CUBLAS)
444
- if (params.n_batch > 1 && embd.size () > 1 )
445
- ggml_cuda_pool_free_all (-1 );
446
- #endif
447
442
// evaluate tokens in batches
448
443
// embd is typically prepared beforehand to fit within a batch, but not always
449
444
for (int i = 0 ; i < (int ) embd.size (); i += params.n_batch ) {
@@ -459,11 +454,6 @@ fprintf(stderr, "+------------+---------+----------+--------+--------+----------
459
454
}
460
455
n_past += n_eval;
461
456
}
462
- #if defined(GGML_USE_CUBLAS)
463
- // frees unused allocations, those during batch processing are of different size than single token eval
464
- if (params.n_batch > 1 && embd.size () > 1 )
465
- ggml_cuda_pool_free_all (-1 );
466
- #endif
467
457
if (embd.size () > 0 && !path_session.empty ()) {
468
458
session_tokens.insert (session_tokens.end (), embd.begin (), embd.end ());
469
459
n_session_consumed = session_tokens.size ();
0 commit comments