@@ -1129,6 +1129,9 @@ struct llama_context {
     // key + value cache for the self attention
     struct llama_kv_cache kv_self;
 
+    std::vector<llama_token> token_history;
+    int64_t previous_v_blck;
+
     // decode output (2-dimensional array: [n_tokens][n_vocab])
     std::vector<float> logits;
     bool logits_all = false;
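Two pieces of bookkeeping state are added to the context: token_history records, for every KV cache position, which token was evaluated there, and previous_v_blck records how far the V cache has been written in units of quantization blocks. As a quick illustration of the block arithmetic, here is a standalone sketch; the block size of 32 is QK8_0, which is what ggml_blck_size() returns for q8_0:

    #include <cstdint>
    #include <cstdio>

    int main() {
        const int64_t v_blck_size = 32; // ggml_blck_size(GGML_TYPE_Q8_0)

        // after evaluating 70 tokens, cache positions 0..69 are filled:
        // V blocks 0 and 1 are complete and quantized, block 2 holds 6 values so far
        const int64_t previous_v_blck = 70 / v_blck_size;
        printf("previous_v_blck = %lld\n", (long long) previous_v_blck); // 2
        return 0;
    }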
@@ -2955,9 +2958,29 @@ static bool llama_eval_internal(
     const int64_t n_embd  = hparams.n_embd;
     const int64_t n_vocab = hparams.n_vocab;
 
+    std::vector<llama_token> tokens_v_redo;
+    const int64_t v_blck_size    = ggml_blck_size(kv_self.v->type);
+    const int64_t current_v_blck = n_past / v_blck_size;
+
+    // if the v component of the KV cache is q8_0 the unquantized temporary values may have already been overwritten
+    // in that case we need to roll back to the beginning of a q8_0 block
+    const int64_t n_v_redo = lctx.previous_v_blck > current_v_blck ? n_past % v_blck_size : 0;
+    if (n_v_redo > 0) {
+        tokens_v_redo.insert(tokens_v_redo.end(),
+                             lctx.token_history.begin() + n_past - n_v_redo,
+                             lctx.token_history.begin() + n_past);
+        for (int64_t i = 0; i < n_tokens; ++i) {
+            tokens_v_redo.push_back(tokens[i]);
+        }
+
+        n_tokens = tokens_v_redo.size();
+        n_past  -= n_v_redo;
+    }
+    const llama_token * tokens_eff = n_v_redo > 0 ? tokens_v_redo.data() : tokens;
+
     ggml_allocr_reset(lctx.alloc);
 
-    ggml_cgraph * gf = llama_build_graph(lctx, tokens, embd, n_tokens, n_past);
+    ggml_cgraph * gf = llama_build_graph(lctx, tokens_eff, embd, n_tokens, n_past);
 
     ggml_allocr_alloc_graph(lctx.alloc, gf);
 
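A worked example of the rollback condition above, assuming the q8_0 block size of 32: if 70 tokens were evaluated earlier (previous_v_blck == 2) and the caller rewinds to n_past == 40, then current_v_blck == 1. Block 1 was already completed and quantized, so its unquantized temporaries are gone, and the 40 % 32 == 8 tokens at positions 32..39 must be re-evaluated in front of the new batch. A self-contained sketch of just this arithmetic:

    #include <cstdint>
    #include <cstdio>

    int main() {
        const int64_t v_blck_size     = 32;               // q8_0 block size
        const int64_t previous_v_blck = 70 / v_blck_size; // 2: blocks 0 and 1 fully quantized
        const int64_t n_past          = 40;               // caller rewound to position 40
        const int64_t current_v_blck  = n_past / v_blck_size; // 1

        const int64_t n_v_redo = previous_v_blck > current_v_blck ? n_past % v_blck_size : 0;
        printf("re-evaluate %lld token(s) starting at position %lld\n",
               (long long) n_v_redo, (long long) (n_past - n_v_redo));
        // prints: re-evaluate 8 token(s) starting at position 32
        return 0;
    }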
@@ -2984,7 +3007,7 @@ static bool llama_eval_internal(
     // TODO: this is mostly important for Apple Silicon where CBLAS is still performing very well
     //       we still need some threads to process all non-mul_mat ops, but not too much to avoid interfering
     //       with the BLAS calls. need a better solution
-    if (N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas()) {
+    if (n_tokens >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas()) {
         n_threads = std::min(4, n_threads);
     }
 
@@ -3042,11 +3065,11 @@ static bool llama_eval_internal(
 
         if (lctx.logits_all) {
             logits_out.resize(n_vocab * N);
-            memcpy(logits_out.data(), (float *) ggml_get_data(res), sizeof(float)*n_vocab*N);
+            memcpy(logits_out.data(), (float *) ggml_get_data(res) + n_vocab*n_v_redo, sizeof(float)*n_vocab*N);
         } else {
             // return result for just the last token
             logits_out.resize(n_vocab);
-            memcpy(logits_out.data(), (float *) ggml_get_data(res) + (n_vocab*(N-1)), sizeof(float)*n_vocab);
+            memcpy(logits_out.data(), (float *) ggml_get_data(res) + n_vocab*(n_v_redo + N - 1), sizeof(float)*n_vocab);
         }
     }
 
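Because the redone tokens are prepended to the batch, the graph output res holds n_v_redo extra rows of logits in front of the N rows the caller asked for (N still counts only the caller's tokens), hence the added n_vocab*n_v_redo offset. A toy illustration of the row layout, with made-up sizes:

    #include <cstdint>
    #include <cstdio>

    int main() {
        const int64_t n_vocab  = 4; // tiny vocabulary for the example
        const int64_t n_v_redo = 2; // redone rows the caller never asked for
        const int64_t N        = 3; // tokens the caller actually submitted

        // one row of n_vocab logits per evaluated token: [redo rows | caller rows]
        float res[(n_v_redo + N) * n_vocab];
        for (int64_t i = 0; i < (n_v_redo + N) * n_vocab; ++i) {
            res[i] = (float) i;
        }

        // logits of the last caller token: skip the redo rows, then take row N-1
        const float * last = res + n_vocab * (n_v_redo + N - 1);
        printf("first logit of the last token: %.0f\n", last[0]); // 16
        return 0;
    }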
@@ -3058,6 +3081,12 @@ static bool llama_eval_internal(
         memcpy(embedding_out.data(), (float *) ggml_get_data(embeddings) + (n_embd*(N - 1)), sizeof(float)*n_embd);
     }
 
+    // update token history and how far the v component of the KV cache was filled (for q8_0 rollback)
+    for (int64_t i = 0; i < n_tokens; ++i) {
+        lctx.token_history[n_past + i] = tokens_eff[i];
+    }
+    lctx.previous_v_blck = (n_past + n_tokens) / v_blck_size;
+
     // measure the performance only for the single-token evals
     if (N == 1) {
         lctx.t_eval_us += ggml_time_us() - t_start_us;
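Note that the history update runs against the rolled-back n_past and the effective token list, so the redone positions are simply rewritten with the tokens they already held and only the genuinely new positions change. The following standalone simulation, under the same block-size-32 assumption, runs the bookkeeping through the rewind scenario from above:

    #include <cstdint>
    #include <cstdio>
    #include <vector>

    int main() {
        const int64_t blck = 32; // q8_0 block size assumed

        std::vector<int32_t> history(128);
        int64_t previous_v_blck = 0;

        // first eval: token ids 0..69 at positions 0..69
        for (int64_t i = 0; i < 70; ++i) { history[i] = (int32_t) i; }
        previous_v_blck = 70 / blck; // 2

        // rewind to n_past = 40 and submit a single new token (id 1000)
        int64_t n_past = 40;
        std::vector<int32_t> batch = { 1000 };

        const int64_t current_v_blck = n_past / blck;                                  // 1
        const int64_t n_v_redo = previous_v_blck > current_v_blck ? n_past % blck : 0; // 8

        // effective batch: the 8 remembered tokens at positions 32..39, then the new one
        std::vector<int32_t> tokens_eff(history.begin() + n_past - n_v_redo,
                                        history.begin() + n_past);
        tokens_eff.insert(tokens_eff.end(), batch.begin(), batch.end());
        n_past -= n_v_redo; // 32

        // history update as in the patch: redone slots keep their old ids
        for (size_t i = 0; i < tokens_eff.size(); ++i) {
            history[n_past + i] = tokens_eff[i];
        }
        previous_v_blck = (n_past + (int64_t) tokens_eff.size()) / blck; // 41 / 32 = 1

        printf("n_v_redo=%lld history[40]=%d previous_v_blck=%lld\n",
               (long long) n_v_redo, history[40], (long long) previous_v_blck);
        // prints: n_v_redo=8 history[40]=1000 previous_v_blck=1
        return 0;
    }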
@@ -5551,6 +5580,9 @@ struct llama_context * llama_new_context_with_model(
 
         const auto & hparams = ctx->model.hparams;
 
+        ctx->token_history.resize(hparams.n_ctx);
+        ctx->previous_v_blck = 0;
+
         // resized during inference
         if (params.logits_all) {
             ctx->logits.reserve(hparams.n_ctx*hparams.n_vocab);
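Finally, the new state is initialized when the context is created: the history gets one slot per context position, and no V block has been written yet. A minimal sketch with a hypothetical stand-in type (toy_context is not part of llama.cpp):

    #include <cstdint>
    #include <vector>

    // hypothetical stand-in for llama_context, just to show the initialization
    struct toy_context {
        std::vector<int32_t> token_history;
        int64_t previous_v_blck = 0;
    };

    toy_context make_context(int64_t n_ctx) {
        toy_context ctx;
        ctx.token_history.resize(n_ctx); // one slot per KV cache position
        ctx.previous_v_blck = 0;         // no V block written yet
        return ctx;
    }

    int main() {
        toy_context ctx = make_context(2048);
        return (int) (ctx.token_history.size() - 2048); // 0
    }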