Apply suggestions to llama.cpp and llama.h

thomasantony · thomasantony · commit 63506ec2dade · 2023-03-17T19:19:19.000-07:00
diff --git a/llama.cpp b/llama.cpp
@@ -818,15 +818,15 @@ bool llama_model_quantize(const std::string & fname_inp, const std::string & fna
 
 /* External API */
 
-const std::vector<gpt_vocab::id>& llama_context_get_embd(const llama_context& ctx) {
+const std::vector<gpt_vocab::id>& llama_context_get_embedding(const llama_context& ctx) {
     return ctx.state->embd;
 }
 gpt_vocab& llama_context_get_vocab(llama_context& ctx) {
     return ctx.vocab;
 }
-bool llama_context_not_finished(const llama_context& ctx)
+bool llama_context_is_finished(const llama_context& ctx)
 {
-    return ctx.state->remaining_tokens > 0;
+    return ctx.state->remaining_tokens <= 0;
 }
 const std::vector<gpt_vocab::id> llama_tokenize_text(const llama_context& ctx, const std::string& text) {
     return llama_tokenize(ctx.vocab, text, true);
@@ -1137,7 +1137,7 @@ bool llama_eval(
     return true;
 }
 
-bool llama_init_context_with_prompt(llama_context& ctx, const std::string& text, bool clear_existing) {
+bool llama_update_context_with_prompt(llama_context& ctx, const std::string& text, bool clear_existing) {
     llama_state& state = *ctx.state;
     llama_model& model = ctx.model;
     const gpt_params& params = ctx.params;
@@ -1173,9 +1173,9 @@ bool llama_init_context_with_prompt(llama_context& ctx, const std::string& text,
     return true;
 }
 
-/// @brief  Injests a batch of input tokens into the context
+/// @brief  Ingests a batch of input tokens into the context
 /// @param ctx 
-void llama_injest_input_batch(llama_context& ctx)
+void llama_ingest_input_batch(llama_context& ctx)
 {
     llama_state& state = *ctx.state;
     const gpt_params& params = ctx.params;
@@ -1241,22 +1241,22 @@ gpt_vocab::id llama_sample_token(llama_context& ctx)
     }
     return id;
 }
-/// @brief Injest all input (in multiple batches) into model and run call predict()
+/// @brief Ingest all input (in multiple batches) into model and call predict()
 /// @param ctx  
-bool llama_injest_input(llama_context& ctx, const std::string& text, bool clear_existing)
+bool llama_ingest_input(llama_context& ctx, const std::string& text, bool clear_existing)
 {
     llama_state& state = *ctx.state;
 
     // Initialize context, tokenize text and clear existing state if necessary
-    if(!state.is_initialized && !llama_init_context_with_prompt(ctx, text, clear_existing))
+    if(!state.is_initialized && !llama_update_context_with_prompt(ctx, text, clear_existing))
     {
         return false;
     }
 
-    // Injest the tokens into the model one batch at a time
+    // Ingest the tokens into the model one batch at a time
     while (state.has_more_input()) 
     {
-        llama_injest_input_batch(ctx);
+        llama_ingest_input_batch(ctx);
         if (state.embd.size() >= 0) {
             if(!llama_predict(ctx))
             {
@@ -1268,7 +1268,7 @@ bool llama_injest_input(llama_context& ctx, const std::string& text, bool clear_
     }
     return true;
 }
-bool llama_inference(llama_context& ctx, gpt_vocab::id& id) {
+bool llama_infer(llama_context& ctx, gpt_vocab::id& id) {
     llama_state& state = *ctx.state;
 
     // Tokenize text if we are starting out
diff --git a/llama.h b/llama.h
@@ -39,26 +39,27 @@ struct llama_hparams {
 
 struct llama_context;
 
-void llama_free_context(llama_context* ctx);
+// Startup
+llama_context* llama_init_from_params(const gpt_params& params);
 
-const std::vector<gpt_vocab::id>& llama_context_get_embd(const llama_context& ctx);
-gpt_vocab& llama_context_get_vocab(llama_context& ctx);
-bool llama_context_not_finished(const llama_context& ctx);
+// Input processing and inference
+bool llama_ingest_input(llama_context& ctx, const std::string& text, bool clear_existing = true);
+bool llama_context_is_finished(const llama_context& ctx);
+bool llama_update_context_with_prompt(llama_context& ctx, const std::string& text, bool clear_existing = true);
 const std::vector<gpt_vocab::id> llama_tokenize_text(const llama_context& ctx, const std::string& text);
+bool llama_infer(llama_context& ctx, gpt_vocab::id& model_output);
 
-const std::vector<gpt_vocab::id>& llama_context_get_last_n_tokens(const llama_context& ctx);
-bool llama_init_context_with_prompt(llama_context& ctx, const std::string& text, bool clear_existing = true);
-
-// Various functions for loading a ggml LLaMA model.
-llama_context* llama_init_from_params(const gpt_params& params);
+// Teardown
+void llama_free_context(llama_context* ctx);
 
-// Run inference on a LLaMA model using llama_context.
-std::vector<float> llama_eval(llama_context& ctx, const gpt_params& params, std::string& text);
+// Getters and setters
+gpt_vocab& llama_context_get_vocab(llama_context& ctx);
+const std::vector<gpt_vocab::id>& llama_context_get_embedding(const llama_context& ctx);
+const std::vector<gpt_vocab::id>& llama_context_get_last_n_tokens(const llama_context& ctx);
 
+// Other
 bool llama_model_quantize(const std::string & fname_inp, const std::string & fname_out, int itype);
 
-bool llama_injest_input(llama_context& ctx, const std::string& text, bool clear_existing = true);
-
-bool llama_inference(llama_context& ctx, gpt_vocab::id& model_output);
+// Stats
 void llama_print_context_info(const llama_context& ctx);
 void llama_print_end_stats(const llama_context& ctx);

Original file line number	Diff line number	Diff line change
`@@ -818,15 +818,15 @@ bool llama_model_quantize(const std::string & fname_inp, const std::string & fna`
`818`	`818`
`819`	`819`	`/* External API */`
`820`	`820`
`821`		`-const std::vector<gpt_vocab::id>& llama_context_get_embd(const llama_context& ctx) {`
	`821`	`+const std::vector<gpt_vocab::id>& llama_context_get_embedding(const llama_context& ctx) {`
`822`	`822`	`return ctx.state->embd;`
`823`	`823`	`}`
`824`	`824`	`gpt_vocab& llama_context_get_vocab(llama_context& ctx) {`
`825`	`825`	`return ctx.vocab;`
`826`	`826`	`}`
`827`		`-bool llama_context_not_finished(const llama_context& ctx)`
	`827`	`+bool llama_context_is_finished(const llama_context& ctx)`
`828`	`828`	`{`
`829`		`- return ctx.state->remaining_tokens > 0;`
	`829`	`+ return ctx.state->remaining_tokens <= 0;`
`830`	`830`	`}`
`831`	`831`	`const std::vector<gpt_vocab::id> llama_tokenize_text(const llama_context& ctx, const std::string& text) {`
`832`	`832`	`return llama_tokenize(ctx.vocab, text, true);`
`@@ -1137,7 +1137,7 @@ bool llama_eval(`
`1137`	`1137`	`return true;`
`1138`	`1138`	`}`
`1139`	`1139`
`1140`		`-bool llama_init_context_with_prompt(llama_context& ctx, const std::string& text, bool clear_existing) {`
	`1140`	`+bool llama_update_context_with_prompt(llama_context& ctx, const std::string& text, bool clear_existing) {`
`1141`	`1141`	`llama_state& state = *ctx.state;`
`1142`	`1142`	`llama_model& model = ctx.model;`
`1143`	`1143`	`const gpt_params& params = ctx.params;`
`@@ -1173,9 +1173,9 @@ bool llama_init_context_with_prompt(llama_context& ctx, const std::string& text,`
`1173`	`1173`	`return true;`
`1174`	`1174`	`}`
`1175`	`1175`
`1176`		`-/// @brief Injests a batch of input tokens into the context`
	`1176`	`+/// @brief Ingests a batch of input tokens into the context`
`1177`	`1177`	`/// @param ctx`
`1178`		`-void llama_injest_input_batch(llama_context& ctx)`
	`1178`	`+void llama_ingest_input_batch(llama_context& ctx)`
`1179`	`1179`	`{`
`1180`	`1180`	`llama_state& state = *ctx.state;`
`1181`	`1181`	`const gpt_params& params = ctx.params;`
`@@ -1241,22 +1241,22 @@ gpt_vocab::id llama_sample_token(llama_context& ctx)`
`1241`	`1241`	`}`
`1242`	`1242`	`return id;`
`1243`	`1243`	`}`
`1244`		`-/// @brief Injest all input (in multiple batches) into model and run call predict()`
	`1244`	`+/// @brief Ingest all input (in multiple batches) into model and call predict()`
`1245`	`1245`	`/// @param ctx`
`1246`		`-bool llama_injest_input(llama_context& ctx, const std::string& text, bool clear_existing)`
	`1246`	`+bool llama_ingest_input(llama_context& ctx, const std::string& text, bool clear_existing)`
`1247`	`1247`	`{`
`1248`	`1248`	`llama_state& state = *ctx.state;`
`1249`	`1249`
`1250`	`1250`	`// Initialize context, tokenize text and clear existing state if necessary`
`1251`		`- if(!state.is_initialized && !llama_init_context_with_prompt(ctx, text, clear_existing))`
	`1251`	`+ if(!state.is_initialized && !llama_update_context_with_prompt(ctx, text, clear_existing))`
`1252`	`1252`	`{`
`1253`	`1253`	`return false;`
`1254`	`1254`	`}`
`1255`	`1255`
`1256`		`- // Injest the tokens into the model one batch at a time`
	`1256`	`+ // Ingest the tokens into the model one batch at a time`
`1257`	`1257`	`while (state.has_more_input())`
`1258`	`1258`	`{`
`1259`		`- llama_injest_input_batch(ctx);`
	`1259`	`+ llama_ingest_input_batch(ctx);`
`1260`	`1260`	`if (state.embd.size() >= 0) {`
`1261`	`1261`	`if(!llama_predict(ctx))`
`1262`	`1262`	`{`
`@@ -1268,7 +1268,7 @@ bool llama_injest_input(llama_context& ctx, const std::string& text, bool clear_`
`1268`	`1268`	`}`
`1269`	`1269`	`return true;`
`1270`	`1270`	`}`
`1271`		`-bool llama_inference(llama_context& ctx, gpt_vocab::id& id) {`
	`1271`	`+bool llama_infer(llama_context& ctx, gpt_vocab::id& id) {`
`1272`	`1272`	`llama_state& state = *ctx.state;`
`1273`	`1273`
`1274`	`1274`	`// Tokenize text if we are starting out`