@@ -3661,7 +3661,7 @@ static void llm_load_tensors(
         LLAMA_LOG_INFO("%s: VRAM used = %7.2f MiB\n", __func__, vram_weights / 1024.0 / 1024.0);
     }
 
-#if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
+#if (defined(GGML_USE_CUBLAS) && !defined(LLAMA_GGML_BACKEND_CUDA_TEST)) || defined(GGML_USE_CLBLAST)
     const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
 
     LLAMA_LOG_INFO("%s: offloading %d repeating layers to GPU\n", __func__, n_gpu);
@@ -9830,17 +9830,12 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
             ggml_context * cpy_ctx = ggml_init({ 6*n_layer*ggml_tensor_overhead() + ggml_graph_overhead(), NULL, /* no_alloc */ true });
             ggml_cgraph * gf = ggml_new_graph(cpy_ctx);
 
-            std::vector<std::vector<uint8_t>> kout2d_data(n_layer);
-            std::vector<std::vector<uint8_t>> vout2d_data(n_layer);
+            std::vector<struct ggml_tensor *> kout2d(n_layer);
+            std::vector<struct ggml_tensor *> vout2d(n_layer);
 
             for (int il = 0; il < (int) n_layer; ++il) {
-                ggml_tensor * kout2d = ggml_new_tensor_2d(cpy_ctx, kv_self.k_l[il]->type, n_embd, kv_head);
-                kout2d_data[il].resize(ggml_nbytes(kout2d));
-                kout2d->data = kout2d_data[il].data();
-
-                ggml_tensor * vout2d = ggml_new_tensor_2d(cpy_ctx, kv_self.v_l[il]->type, kv_head, n_embd);
-                vout2d_data[il].resize(ggml_nbytes(vout2d));
-                vout2d->data = vout2d_data[il].data();
+                kout2d[il] = ggml_new_tensor_2d(cpy_ctx, kv_self.k_l[il]->type, n_embd, kv_head);
+                vout2d[il] = ggml_new_tensor_2d(cpy_ctx, kv_self.v_l[il]->type, kv_head, n_embd);
 
                 ggml_tensor * k2d = ggml_view_2d(cpy_ctx, kv_self.k_l[il],
                     n_embd, kv_head,
@@ -9850,21 +9845,28 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
                     kv_head, n_embd,
                     elt_size*n_ctx, 0);
 
-                ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, k2d, kout2d));
-                ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, v2d, vout2d));
+                ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, k2d, kout2d[il]));
+                ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, v2d, vout2d[il]));
             }
 
-            std::vector<uint8_t> work_buffer;
-            ggml_graph_compute_helper(work_buffer, gf, ctx->cparams.n_threads);
+            ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors(cpy_ctx, ctx->backend);
 
-            ggml_free(cpy_ctx);
+            ggml_backend_graph_compute(ctx->backend, gf);
+
+            std::vector<uint8_t> tmp_buf;
+            for (int il = 0; il < (int) n_layer; ++il) {
+                tmp_buf.resize(ggml_nbytes(kout2d[il]));
+                ggml_backend_tensor_get(kout2d[il], tmp_buf.data(), 0, tmp_buf.size());
+                data_ctx->write(tmp_buf.data(), tmp_buf.size());
 
-            // our data is now in the kout2d_data and vout2d_data buffers
-            // write them to file
-            for (uint32_t il = 0; il < n_layer; ++il) {
-                data_ctx->write(kout2d_data[il].data(), kout2d_data[il].size());
-                data_ctx->write(vout2d_data[il].data(), vout2d_data[il].size());
+                tmp_buf.resize(ggml_nbytes(vout2d[il]));
+                ggml_backend_tensor_get(vout2d[il], tmp_buf.data(), 0, tmp_buf.size());
+                data_ctx->write(tmp_buf.data(), tmp_buf.size());
             }
+
+            ggml_free(cpy_ctx);
+
+            ggml_backend_buffer_free(buf);
         }
 
         for (uint32_t i = 0; i < kv_size; ++i) {
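The two hunks above move llama_copy_state_data_internal off the host-side std::vector staging buffers and onto the ggml-backend API: the kout2d/vout2d destination tensors are created in a no_alloc context, placed in a backend buffer with ggml_backend_alloc_ctx_tensors, the copy graph is run with ggml_backend_graph_compute, and the results are read back to host memory with ggml_backend_tensor_get before being written out. The standalone sketch below illustrates that allocate/compute/read-back pattern in isolation; it assumes a CPU backend (ggml_backend_cpu_init) and the header layout of roughly this ggml revision, so include paths and minor signatures may differ in other versions.

// Minimal sketch: build a tiny graph in a no_alloc context, place its tensors
// in a backend buffer, compute on the backend, and read the result back to
// host memory. CPU backend assumed; other backends follow the same pattern.
#include "ggml.h"
#include "ggml-alloc.h"    // ggml_backend_alloc_ctx_tensors (header may differ by version)
#include "ggml-backend.h"

#include <cstdio>
#include <vector>

int main() {
    ggml_backend_t backend = ggml_backend_cpu_init();

    // no_alloc = true: only tensor metadata lives here, data goes in the backend buffer
    ggml_context * ctx = ggml_init({ 8*ggml_tensor_overhead() + ggml_graph_overhead(), NULL, /* no_alloc */ true });
    ggml_cgraph  * gf  = ggml_new_graph(ctx);

    ggml_tensor * src = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 8);
    ggml_tensor * dst = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 8);
    ggml_build_forward_expand(gf, ggml_cpy(ctx, src, dst));

    // allocate all tensors of the context in a buffer owned by the backend
    ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors(ctx, backend);

    // upload the input, run the graph, read the output back to the host
    std::vector<float> in(8, 1.0f);
    ggml_backend_tensor_set(src, in.data(), 0, ggml_nbytes(src));

    ggml_backend_graph_compute(backend, gf);

    std::vector<float> out(8);
    ggml_backend_tensor_get(dst, out.data(), 0, ggml_nbytes(dst));
    printf("out[0] = %f\n", out[0]);

    ggml_free(ctx);
    ggml_backend_buffer_free(buf);
    ggml_backend_free(backend);
    return 0;
}

Staging through backend tensors instead of raw host pointers matters because the KV cache data may live in device memory (e.g. VRAM), where it cannot simply be memcpy'd on the host.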
@@ -9969,14 +9971,12 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
             ggml_context * cpy_ctx = ggml_init({ 6*n_layer*ggml_tensor_overhead() + ggml_graph_overhead(), NULL, /* no_alloc */ true });
             ggml_cgraph * gf = ggml_new_graph(cpy_ctx);
 
-            for (int il = 0; il < n_layer; ++il) {
-                ggml_tensor * kin2d = ggml_new_tensor_2d(cpy_ctx, kv_self.k_l[il]->type, n_embd, kv_head);
-                kin2d->data = (void *) inp;
-                inp += ggml_nbytes(kin2d);
+            std::vector<struct ggml_tensor *> kin2d(n_layer);
+            std::vector<struct ggml_tensor *> vin2d(n_layer);
 
-                ggml_tensor * vin2d = ggml_new_tensor_2d(cpy_ctx, kv_self.v_l[il]->type, kv_head, n_embd);
-                vin2d->data = (void *) inp;
-                inp += ggml_nbytes(vin2d);
+            for (int il = 0; il < n_layer; ++il) {
+                kin2d[il] = ggml_new_tensor_2d(cpy_ctx, kv_self.k_l[il]->type, n_embd, kv_head);
+                vin2d[il] = ggml_new_tensor_2d(cpy_ctx, kv_self.v_l[il]->type, kv_head, n_embd);
 
                 ggml_tensor * k2d = ggml_view_2d(cpy_ctx, kv_self.k_l[il],
                     n_embd, kv_head,
@@ -9986,14 +9986,26 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
                     kv_head, n_embd,
                     elt_size*n_ctx, 0);
 
-                ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, kin2d, k2d));
-                ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, vin2d, v2d));
+                ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, kin2d[il], k2d));
+                ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, vin2d[il], v2d));
+            }
+
+            ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors(cpy_ctx, ctx->backend);
+
+            // load data into the tensors
+            for (int il = 0; il < n_layer; ++il) {
+                ggml_backend_tensor_set(kin2d[il], inp, 0, ggml_nbytes(kin2d[il]));
+                inp += ggml_nbytes(kin2d[il]);
+
+                ggml_backend_tensor_set(vin2d[il], inp, 0, ggml_nbytes(vin2d[il]));
+                inp += ggml_nbytes(vin2d[il]);
             }
 
-            std::vector<uint8_t> work_buffer;
-            ggml_graph_compute_helper(work_buffer, gf, ctx->cparams.n_threads);
+            ggml_backend_graph_compute(ctx->backend, gf);
 
             ggml_free(cpy_ctx);
+
+            ggml_backend_buffer_free(buf);
         }
 
         ctx->kv_self.head = kv_head;
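llama_set_state_data is changed in the mirror direction: kin2d/vin2d are allocated in a backend buffer, filled from the flat inp byte stream with ggml_backend_tensor_set (advancing inp by ggml_nbytes per tensor), and the copy graph then scatters them into the KV cache views on the backend. A small sketch of just that upload step follows, again assuming a CPU backend; the tensor names and sizes are illustrative only, not llama.cpp symbols.

// Sketch of streaming a flat host byte buffer into backend tensors, advancing
// the read pointer by ggml_nbytes() per tensor (mirrors the inp += ... pattern).
#include "ggml.h"
#include "ggml-alloc.h"   // ggml_backend_alloc_ctx_tensors (header may differ by version)
#include "ggml-backend.h"

#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
    ggml_backend_t backend = ggml_backend_cpu_init();

    ggml_context * ctx = ggml_init({ 4*ggml_tensor_overhead(), NULL, /* no_alloc */ true });

    ggml_tensor * k = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4);
    ggml_tensor * v = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4);

    ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors(ctx, backend);

    // flat host stream: k's bytes followed by v's bytes
    std::vector<float> blob = { 1, 2, 3, 4, 5, 6, 7, 8 };
    const uint8_t * inp = reinterpret_cast<const uint8_t *>(blob.data());

    ggml_tensor * tensors[2] = { k, v };
    for (ggml_tensor * t : tensors) {
        ggml_backend_tensor_set(t, inp, 0, ggml_nbytes(t));
        inp += ggml_nbytes(t);
    }

    // read one value back to confirm the upload landed in the backend buffer
    float v0 = 0.0f;
    ggml_backend_tensor_get(v, &v0, 0, sizeof(v0));
    printf("v[0] = %f\n", v0);   // expected: 5

    ggml_free(ctx);
    ggml_backend_buffer_free(buf);
    ggml_backend_free(backend);
    return 0;
}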