Skip to content

Commit e8b8d32

Browse files
authored
server : fix incorrect num_tokens_predicted (#3480)
1 parent 8f3a642 commit e8b8d32

File tree

1 file changed

+5
-3
lines changed

1 file changed

+5
-3
lines changed

examples/server/server.cpp

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -504,9 +504,11 @@ struct llama_server_context
 504  504              });
 505  505          }
 506  506
      507 +        bool tg = true;
 507  508          while (n_past < embd.size())
 508  509          {
 509  510              int n_eval = (int)embd.size() - n_past;
      511 +            tg = n_eval == 1;
 510  512              if (n_eval > params.n_batch)
 511  513              {
 512  514                  n_eval = params.n_batch;
@@ -633,7 +635,9 @@ struct llama_server_context
 633  635
 634  636              last_n_tokens.erase(last_n_tokens.begin());
 635  637              last_n_tokens.push_back(result.tok);
 636      -            num_tokens_predicted++;
      638 +            if (tg) {
      639 +                num_tokens_predicted++;
      640 +            }
 637  641          }
 638  642
 639  643          // add it to the context
@@ -1124,8 +1128,6 @@ static json format_timings(llama_server_context &llama)
 1124 1128  {
 1125 1129      const auto timings = llama_get_timings(llama.ctx);
 1126 1130
 1127      -    assert(timings.n_eval == ptrdiff_t(llama.num_tokens_predicted));
 1128      -
 1129 1131      return json{
 1130 1132          {"prompt_n", timings.n_p_eval},
 1131 1133          {"prompt_ms", timings.t_p_eval_ms},

0 commit comments

Comments
 (0)