@@ -1663,13 +1663,24 @@ struct llama_model {
 struct llama_context {
     llama_context(const llama_model & model) : model(model), t_start_us(model.t_start_us), t_load_us(model.t_load_us) {}
     ~llama_context() {
+        for (auto & it : bufs_compute) {
+            // restore the original buffer in the tallocr
+            ggml_tallocr_t allocr = ggml_backend_sched_get_tallocr(sched, it.first);
+            ggml_tallocr_set_buffer(allocr, it.second[0]);
+            // free the rest of the buffers
+            for (size_t i = 1; i < it.second.size(); ++i) {
+                ggml_backend_buffer_free(it.second[i]);
+            }
+        }
+
         ggml_backend_sched_free(sched);
 
         for (ggml_backend_t backend : backends) {
             ggml_backend_free(backend);
         }
 
         ggml_backend_buffer_free(buf_logits);
+
     }
 
     llama_cparams cparams;
@@ -1719,10 +1730,11 @@ struct llama_context {
     std::vector<uint8_t> buf_compute_meta;
     ggml_backend_sched_t sched = nullptr;
     // allocator for the input tensors
-    ggml_tallocr * alloc_cpu = nullptr;
+    ggml_tallocr_t alloc_cpu = nullptr;
 
-    std::vector<ggml_backend_buffer_t> buf_cpu_ub;
-    size_t buf_cpu_ub_cur = 0;
+    std::map<ggml_backend_t, std::vector<ggml_backend_buffer_t>> bufs_compute;
+    size_t n_compute_bufs = 0;
+    size_t i_compute_buf = 0;
 
     // temporary buffer for copying data to/from the backend
     std::vector<no_init<uint8_t>> buf_copy;
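The members above are the bookkeeping for the microbatch compute buffers: bufs_compute maps each backend known to the scheduler to a pool of n_compute_bufs equally sized buffers, and i_compute_buf is the pool slot the next microbatch will use. A minimal sketch of the invariant this layout relies on, written as a hypothetical check_compute_pools() helper placed inside llama.cpp where these types are visible:

static void check_compute_pools(const std::map<ggml_backend_t, std::vector<ggml_backend_buffer_t>> & bufs_compute,
                                size_t n_compute_bufs, size_t i_compute_buf) {
    // the current slot must stay inside the pool
    GGML_ASSERT(i_compute_buf < n_compute_bufs);
    // every backend is expected to expose the same number of compute buffers
    for (const auto & it : bufs_compute) {
        GGML_ASSERT(it.second.size() == n_compute_bufs);
    }
}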
@@ -6704,15 +6716,17 @@ static int llama_decode_internal(
         // printf("kv_self.n = %5d, kv_self.used = %5d, kv_self.head = %5d\n", kv_self.n, kv_self.used, kv_self.head);
 
         // change the CPU compute buffer to avoid overwriting inputs
-        size_t buf_cpu_ub_cur = lctx.buf_cpu_ub_cur;
-        lctx.buf_cpu_ub_cur = (lctx.buf_cpu_ub_cur + 1) % lctx.buf_cpu_ub.size();
-        if (buf_cpu_ub_cur == 0 && cur_token > 0) {
+        size_t i_compute_buf = lctx.i_compute_buf;
+        lctx.i_compute_buf = (lctx.i_compute_buf + 1) % lctx.n_compute_bufs;
+        if (i_compute_buf == 0 && cur_token > 0) {
             // sync all backends to ensure that the current buffer is not in use
             printf("not enough buffers, syncing now\n");
             ggml_backend_sched_synchronize(lctx.sched);
         }
-
-        ggml_tallocr_set_buffer(lctx.alloc_cpu, lctx.buf_cpu_ub.at(buf_cpu_ub_cur));
+        for (auto it : lctx.bufs_compute) {
+            ggml_tallocr_t alloc = ggml_backend_sched_get_tallocr(lctx.sched, it.first);
+            ggml_tallocr_set_buffer(alloc, it.second.at(i_compute_buf));
+        }
 
         ggml_backend_sched_reset(lctx.sched);
 
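Condensing the hunk above: each microbatch claims the next slot in the pool, and when the index wraps around while earlier microbatches may still be in flight, all backends are synchronized first so the reused buffer cannot still be referenced. A sketch of the same rotation as a hypothetical helper inside llama.cpp (the patch itself keeps the logic inline in llama_decode_internal):

static size_t rotate_compute_buffers(llama_context & lctx, uint32_t cur_token) {
    const size_t i_compute_buf = lctx.i_compute_buf;
    lctx.i_compute_buf = (lctx.i_compute_buf + 1) % lctx.n_compute_bufs;

    if (i_compute_buf == 0 && cur_token > 0) {
        // wrapped around: wait for previously submitted graphs before reusing their buffers
        ggml_backend_sched_synchronize(lctx.sched);
    }

    // install the chosen slot in every backend's tensor allocator
    for (const auto & it : lctx.bufs_compute) {
        ggml_tallocr_t alloc = ggml_backend_sched_get_tallocr(lctx.sched, it.first);
        ggml_tallocr_set_buffer(alloc, it.second.at(i_compute_buf));
    }

    return i_compute_buf;
}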
@@ -6833,7 +6847,7 @@ static int llama_decode_internal(
     }
 
     ggml_backend_sched_synchronize(lctx.sched);
-    lctx.buf_cpu_ub_cur = 0;
+    lctx.i_compute_buf = 0;
 
     // measure the performance only for the single-token evals
     if (n_tokens_all == 1) {
@@ -10003,14 +10017,26 @@ struct llama_context * llama_new_context_with_model(
         ctx->alloc_cpu = ggml_backend_sched_get_tallocr(ctx->sched, ctx->backend_cpu);
 
         // duplicate cpu buffers for microbatching
-        ggml_backend_buffer_t buf_cpu = ggml_tallocr_get_buffer(ctx->alloc_cpu);
-        size_t buf_size = ggml_backend_buffer_get_size(buf_cpu);
-        ctx->buf_cpu_ub.push_back(buf_cpu);
-        int n_ub = 64;
-        for (int i = 1; i < n_ub; ++i) {
-            ggml_backend_buffer_t buf = ggml_backend_buft_alloc_buffer(llama_default_buffer_type_cpu(true), buf_size);
-            ctx->buf_cpu_ub.push_back(buf);
+        const int n_ub = 16;
+        ctx->n_compute_bufs = n_ub;
+
+        for (ggml_backend_t b : ctx->backends) {
+            ggml_tallocr_t alloc = ggml_backend_sched_get_tallocr(ctx->sched, b);
+            ggml_backend_buffer_t buf = ggml_tallocr_get_buffer(alloc);
+            size_t buf_size = ggml_backend_buffer_get_size(buf);
+            ctx->bufs_compute[b].push_back(buf);
+            auto * buft = ggml_backend_buffer_get_type(buf);
+            for (int i = 1; i < n_ub; ++i) {
+                ggml_backend_buffer_t buf = ggml_backend_buft_alloc_buffer(buft, buf_size);
+                if (buf == nullptr) {
+                    LLAMA_LOG_ERROR("%s: failed to allocate compute buffer\n", __func__);
+                    llama_free(ctx);
+                    return nullptr;
+                }
+                ctx->bufs_compute[b].push_back(buf);
+            }
         }
+
         // allocate buffer for logits output
         ctx->buf_logits = ggml_backend_buft_alloc_buffer(llama_default_buffer_type_cpu(true), hparams.n_vocab*cparams.n_ctx*sizeof(float));
         if (ctx->buf_logits == nullptr) {
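With n_ub = 16, each backend keeps its original scheduler-owned compute buffer plus 15 same-sized duplicates, so up to 16 microbatches can be prepared before the rotation forces a synchronization. A rough sketch of the resulting memory overhead, using a hypothetical compute_buffer_overhead() helper inside llama.cpp:

static size_t compute_buffer_overhead(const llama_context & ctx) {
    size_t total = 0;
    for (const auto & it : ctx.bufs_compute) {
        if (!it.second.empty()) {
            // it.second[0] is the original buffer; the rest are same-sized duplicates
            total += (it.second.size() - 1)*ggml_backend_buffer_get_size(it.second[0]);
        }
    }
    return total;
}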
@@ -10816,21 +10842,21 @@ int32_t llama_decode(
 
 float * llama_get_logits(struct llama_context * ctx) {
     ggml_backend_sched_synchronize(ctx->sched);
-    ctx->buf_cpu_ub_cur = 0;
+    ctx->i_compute_buf = 0;
     return ctx->logits;
 }
 
 float * llama_get_logits_ith(struct llama_context * ctx, int32_t i) {
     ggml_backend_sched_synchronize(ctx->sched);
-    ctx->buf_cpu_ub_cur = 0;
+    ctx->i_compute_buf = 0;
 
     assert(ctx->logits_valid.at(i));
     return ctx->logits + i*ctx->model.hparams.n_vocab;
 }
 
 float * llama_get_embeddings(struct llama_context * ctx) {
     ggml_backend_sched_synchronize(ctx->sched);
-    ctx->buf_cpu_ub_cur = 0;
+    ctx->i_compute_buf = 0;
 
     return ctx->embedding.data();
 }
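From the API side, reading results is what closes the rotation: the getters synchronize the scheduler and reset i_compute_buf to 0, so the next llama_decode() starts again at the first buffer of every pool. A minimal usage sketch against the public API, with error handling reduced to the essentials:

#include "llama.h"

static float * decode_and_read_logits(llama_context * ctx, llama_batch batch) {
    if (llama_decode(ctx, batch) != 0) {
        return nullptr; // decode failed or no KV cache slot was found
    }
    // synchronizes all backends and resets the compute-buffer rotation
    return llama_get_logits(ctx);
}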