@@ -2450,8 +2450,8 @@ size_t llama_get_state_size(const struct llama_context * ctx) {
 }
 
 // Copies the state to the specified destination address
-size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dest) {
-    uint8_t * out = dest;
+size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) {
+    uint8_t * out = dst;
 
     // copy rng
     {
@@ -2511,7 +2511,9 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dest) {
 
         if (kv_size) {
             const size_t elt_size = ggml_element_size(kv_self.k);
+
             char buffer[4096];
+
             ggml_context * cpy_ctx = ggml_init({ sizeof(buffer), buffer, /* no_alloc */ true });
             ggml_cgraph gf{};
             gf.n_threads = 1;
@@ -2535,10 +2537,12 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dest) {
             ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, k3d, kout3d));
             ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, v3d, vout3d));
             ggml_graph_compute(cpy_ctx, &gf);
+
+            ggml_free(cpy_ctx);
         }
     }
 
-    const size_t written = out - dest;
+    const size_t written = out - dst;
     const size_t max_size = llama_get_state_size(ctx);
 
     LLAMA_ASSERT(written <= max_size);
@@ -2548,15 +2552,15 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dest) {
 
 // Sets the state reading from the specified source address
 size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) {
-    const uint8_t * in = src;
+    const uint8_t * inp = src;
 
     // set rng
     {
         size_t rng_size;
         char rng_buf[LLAMA_MAX_RNG_STATE];
 
-        memcpy(&rng_size, in, sizeof(rng_size)); in += sizeof(rng_size);
-        memcpy(&rng_buf[0], in, LLAMA_MAX_RNG_STATE); in += LLAMA_MAX_RNG_STATE;
+        memcpy(&rng_size, inp, sizeof(rng_size)); inp += sizeof(rng_size);
+        memcpy(&rng_buf[0], inp, LLAMA_MAX_RNG_STATE); inp += LLAMA_MAX_RNG_STATE;
 
         std::stringstream rng_ss;
         rng_ss.str(std::string(&rng_buf[0], rng_size));
@@ -2570,30 +2574,30 @@ size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) {
         size_t logits_cap;
         size_t logits_size;
 
-        memcpy(&logits_cap, in, sizeof(logits_cap)); in += sizeof(logits_cap);
-        memcpy(&logits_size, in, sizeof(logits_size)); in += sizeof(logits_size);
+        memcpy(&logits_cap, inp, sizeof(logits_cap)); inp += sizeof(logits_cap);
+        memcpy(&logits_size, inp, sizeof(logits_size)); inp += sizeof(logits_size);
 
         LLAMA_ASSERT(ctx->logits.capacity() == logits_cap);
 
         if (logits_size) {
             ctx->logits.resize(logits_size);
-            memcpy(ctx->logits.data(), in, logits_size * sizeof(float));
+            memcpy(ctx->logits.data(), inp, logits_size * sizeof(float));
         }
 
-        in += logits_cap * sizeof(float);
+        inp += logits_cap * sizeof(float);
     }
 
     // set embeddings
     {
         size_t embedding_size;
 
-        memcpy(&embedding_size, in, sizeof(embedding_size)); in += sizeof(embedding_size);
+        memcpy(&embedding_size, inp, sizeof(embedding_size)); inp += sizeof(embedding_size);
 
         LLAMA_ASSERT(ctx->embedding.capacity() == embedding_size);
 
         if (embedding_size) {
-            memcpy(ctx->embedding.data(), in, embedding_size * sizeof(float));
-            in += embedding_size * sizeof(float);
+            memcpy(ctx->embedding.data(), inp, embedding_size * sizeof(float));
+            inp += embedding_size * sizeof(float);
         }
     }
 
@@ -2608,25 +2612,27 @@ size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) {
         size_t kv_size;
         int kv_ntok;
 
-        memcpy(&kv_size, in, sizeof(kv_size)); in += sizeof(kv_size);
-        memcpy(&kv_ntok, in, sizeof(kv_ntok)); in += sizeof(kv_ntok);
+        memcpy(&kv_size, inp, sizeof(kv_size)); inp += sizeof(kv_size);
+        memcpy(&kv_ntok, inp, sizeof(kv_ntok)); inp += sizeof(kv_ntok);
 
         if (kv_size) {
             LLAMA_ASSERT(kv_self.buf.size == kv_size);
 
             const size_t elt_size = ggml_element_size(kv_self.k);
+
             char buffer[4096];
+
             ggml_context * cpy_ctx = ggml_init({ sizeof(buffer), buffer, /* no_alloc */ true });
             ggml_cgraph gf{};
             gf.n_threads = 1;
 
             ggml_tensor * kin3d = ggml_new_tensor_3d(cpy_ctx, kv_self.k->type, n_embd, kv_ntok, n_layer);
-            kin3d->data = (void *) in;
-            in += ggml_nbytes(kin3d);
+            kin3d->data = (void *) inp;
+            inp += ggml_nbytes(kin3d);
 
             ggml_tensor * vin3d = ggml_new_tensor_3d(cpy_ctx, kv_self.v->type, kv_ntok, n_embd, n_layer);
-            vin3d->data = (void *) in;
-            in += ggml_nbytes(vin3d);
+            vin3d->data = (void *) inp;
+            inp += ggml_nbytes(vin3d);
 
             ggml_tensor * k3d = ggml_view_3d(cpy_ctx, kv_self.k,
                 n_embd, kv_ntok, n_layer,
@@ -2639,12 +2645,14 @@ size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) {
             ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, kin3d, k3d));
             ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, vin3d, v3d));
             ggml_graph_compute(cpy_ctx, &gf);
+
+            ggml_free(cpy_ctx);
         }
 
         ctx->model.kv_self.n = kv_ntok;
     }
 
-    const size_t nread = in - src;
+    const size_t nread = inp - src;
     const size_t max_size = llama_get_state_size(ctx);
 
     LLAMA_ASSERT(nread <= max_size);
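For context, the three functions touched by this diff form the session-state API: llama_get_state_size() reports an upper bound on the serialized state (rng, logits, embeddings and kv cache), llama_copy_state_data() writes that state into a caller-provided buffer and returns the number of bytes written, and llama_set_state_data() reads it back. A minimal caller-side sketch (not part of this commit; the save_state/load_state helpers are illustrative) might look like:

#include <cstdint>
#include <vector>

#include "llama.h"

// Serialize the full context state into a tightly-sized buffer.
static std::vector<uint8_t> save_state(struct llama_context * ctx) {
    // Upper bound on the serialized size.
    const size_t max_size = llama_get_state_size(ctx);

    std::vector<uint8_t> state(max_size);

    // Returns the number of bytes actually written (asserted <= max_size in the diff above).
    const size_t written = llama_copy_state_data(ctx, state.data());
    state.resize(written);

    return state;
}

// Restore a previously saved state into a context created with the same parameters.
static void load_state(struct llama_context * ctx, const std::vector<uint8_t> & state) {
    llama_set_state_data(ctx, state.data());
}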