@@ -246,26 +246,67 @@ struct llama_layer {
     struct ggml_tensor * w3;
 };

-struct llama_kv_cache {
+class llama_kv_cache {
+    // Hide ctx as it requires a custom deleter ggml_free.
+    std::shared_ptr<ggml_context> ctx;
+public:
+
     struct ggml_tensor * k = NULL;
     struct ggml_tensor * v = NULL;

-    struct ggml_context * ctx = NULL;
-
     llama_ctx_buffer buf;

     int n; // number of tokens currently in the cache

-    ~llama_kv_cache() {
-        if (ctx) {
-            ggml_free(ctx);
-        }
+    ggml_context * get_ctx() { return ctx.get(); }
+    ggml_context const * get_ctx() const { return ctx.get(); }
+    void set_ctx(ggml_context * ctx) {
+        this->ctx = std::shared_ptr<ggml_context>(ctx, ggml_free);
+    }

+    llama_kv_cache() = default;
+    ~llama_kv_cache() {
 #ifdef GGML_USE_CUBLAS
         ggml_cuda_free_data(k);
         ggml_cuda_free_data(v);
 #endif // GGML_USE_CUBLAS
     }
+    llama_kv_cache(llama_kv_cache const & rhs)
+        : ctx(rhs.ctx.get(), ggml_free)
+        , k(ggml_dup_tensor(rhs.ctx.get(), rhs.k))
+        , v(ggml_dup_tensor(rhs.ctx.get(), rhs.v))
+        , buf(rhs.buf)
+        , n(rhs.n)
+    { }
+    llama_kv_cache & operator=(llama_kv_cache const & rhs) {
+        this->~llama_kv_cache();
+        ctx = rhs.ctx;
+        k = rhs.k ? ggml_dup_tensor(rhs.ctx.get(), rhs.k) : NULL;
+        v = rhs.v ? ggml_dup_tensor(rhs.ctx.get(), rhs.v) : NULL;
+        buf = rhs.buf;
+        n = rhs.n;
+        return *this;
+    }
+    llama_kv_cache(llama_kv_cache && rhs)
+        : ctx(std::move(rhs.ctx))
+        , k(rhs.k)
+        , v(rhs.v)
+        , buf(std::move(rhs.buf))
+        , n(rhs.n)
+    {
+        rhs.k = NULL;
+        rhs.v = NULL;
+    }
+    llama_kv_cache & operator=(llama_kv_cache && rhs) {
+        this->~llama_kv_cache();
+        ctx = std::move(rhs.ctx);
+        std::swap(k, rhs.k);
+        std::swap(v, rhs.v);
+        buf = std::move(rhs.buf);
+        n = rhs.n;
+        return *this;
+    }
+
 };

 struct llama_vocab {
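
Aside (not part of the patch): the class above hands the ggml_context to a std::shared_ptr with ggml_free as a custom deleter, so the C-style context is released exactly once no matter how many copies of the cache share it. Below is a minimal, self-contained sketch of that pattern; c_ctx, c_ctx_init, and c_ctx_free are hypothetical stand-ins for ggml_context, ggml_init, and ggml_free.

    #include <cstdio>
    #include <memory>

    struct c_ctx { int id; };                                // stand-in for ggml_context

    c_ctx * c_ctx_init(int id) { return new c_ctx{id}; }     // stand-in for ggml_init

    void c_ctx_free(c_ctx * p) {                             // stand-in for ggml_free
        std::printf("freeing ctx %d\n", p->id);
        delete p;
    }

    class ctx_holder {
        std::shared_ptr<c_ctx> ctx;                          // deleter is stored with the pointer
    public:
        void set_ctx(c_ctx * raw) { ctx = std::shared_ptr<c_ctx>(raw, c_ctx_free); }
        c_ctx * get_ctx() const { return ctx.get(); }
    };

    int main() {
        ctx_holder a;
        a.set_ctx(c_ctx_init(1));
        ctx_holder b = a;                                    // copies share ownership of one context
        std::printf("ctx id seen through copy: %d\n", b.get_ctx()->id);
        return 0;
    }                                                        // c_ctx_free runs once, when the last holder dies

The point of the sketch is only the ownership rule: the deleter travels with the shared control block, so whichever owner is destroyed last calls the C-style free exactly once.
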
@@ -863,15 +904,15 @@ static bool kv_cache_init(
     params.mem_buffer = cache.buf.addr;
     params.no_alloc   = false;

-    cache.ctx = ggml_init(params);
+    cache.set_ctx(ggml_init(params));

-    if (!cache.ctx) {
+    if (!cache.get_ctx()) {
         fprintf(stderr, "%s: failed to allocate memory for kv cache\n", __func__);
         return false;
     }

-    cache.k = ggml_new_tensor_1d(cache.ctx, wtype, n_elements);
-    cache.v = ggml_new_tensor_1d(cache.ctx, wtype, n_elements);
+    cache.k = ggml_new_tensor_1d(cache.get_ctx(), wtype, n_elements);
+    cache.v = ggml_new_tensor_1d(cache.get_ctx(), wtype, n_elements);
     ggml_set_name(cache.k, "cache_k");
     ggml_set_name(cache.v, "cache_v");

@@ -1410,7 +1451,7 @@ static struct ggml_cgraph * llama_build_graph(

     const auto & kv_self = lctx.kv_self;

-    LLAMA_ASSERT(!!kv_self.ctx);
+    LLAMA_ASSERT(!!kv_self.get_ctx());

     const int64_t n_embd  = hparams.n_embd;
     const int64_t n_layer = hparams.n_layer;
@@ -2878,6 +2919,7 @@ void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar
 }

 struct beam {
+    llama_kv_cache kv_cache;
     std::vector<llama_token> tokens;
     float p; // Cumulative beam probability (renormalized with each token)
     // end-of-sentence
@@ -2948,13 +2990,15 @@ void fill_next_beams_by_top_probabilities(llama_context* ctx, std::vector<beam>&
             }
         } else if (next_beams.front().p < b.p) {
             std::pop_heap(next_beams.begin(), next_beams.end(), comp);
-            next_beams.back() = b;
+            next_beams.back() = std::move(b);
             std::push_heap(next_beams.begin(), next_beams.end(), comp);
         }
     } else {
         // b is not at end-of-sentence, so branch with next top_k tokens.
         if (!b.tokens.empty()) {
+            std::swap(ctx->kv_self, const_cast<beam&>(b).kv_cache);
             llama_eval(ctx, b.tokens.data(), b.tokens.size(), n_past, n_threads);
+            std::swap(ctx->kv_self, const_cast<beam&>(b).kv_cache);
         }
         logit_info li(ctx);
         std::vector<llama_token_data> next_tokens = li.top_k(beam_width);
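
Aside (not part of the patch): the two std::swap calls above borrow the beam's own KV cache for the duration of llama_eval and then hand it back updated, so the context never has to copy a cache. A minimal sketch of that swap-in/swap-out pattern is below, using simplified hypothetical types (eval_ctx, kv_state, beam_like) rather than the real llama.cpp ones.

    #include <cstdio>
    #include <utility>
    #include <vector>

    struct kv_state { std::vector<int> cells; };   // simplified stand-in for llama_kv_cache

    struct eval_ctx { kv_state kv_self; };         // simplified stand-in for llama_context

    // Pretend evaluation: appends tokens to whatever cache the context currently holds.
    void eval(eval_ctx & ctx, const std::vector<int> & tokens) {
        ctx.kv_self.cells.insert(ctx.kv_self.cells.end(), tokens.begin(), tokens.end());
    }

    struct beam_like { kv_state kv_cache; std::vector<int> tokens; };

    void eval_beam(eval_ctx & ctx, beam_like & b) {
        std::swap(ctx.kv_self, b.kv_cache);        // borrow the beam's cache
        eval(ctx, b.tokens);
        std::swap(ctx.kv_self, b.kv_cache);        // hand it back, now updated
    }

    int main() {
        eval_ctx  ctx;
        beam_like b{ {}, {1, 2, 3} };
        eval_beam(ctx, b);
        std::printf("beam cache size: %zu, ctx cache size: %zu\n",
                    b.kv_cache.cells.size(), ctx.kv_self.cells.size());   // prints 3, 0
        return 0;
    }
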
@@ -3006,11 +3050,11 @@ const char* llama_beam_search(llama_context * ctx, int const beam_width,

     std::vector<beam> beams;
     beams.reserve(beam_width);
-    beams.push_back({{}, 1.0});
+    beams.push_back({ctx->kv_self, {}, 1.0});
     std::vector<beam> next_beams;
     next_beams.reserve(beam_width);
     // Loop while there are any beams that have not yet reached end-of-sentence.
-    // If the top beam is at end-of-sentence, then finish since all other
+    // If the highest probability beam is at end-of-sentence, then finish since all other
     // beam probabilities can only decrease.
     auto const eos = [](beam const & b) { return b.eos(); };
     for (int i=0; i<n_predict && !eos(top_beam(beams)) &&