Apply suggestions to llama.cpp and llama.h

thomasantony · thomasantony · commit b0ed03b9ab90 · 2023-03-19T13:31:57.000-07:00
diff --git a/llama.cpp b/llama.cpp
@@ -810,15 +810,15 @@ bool llama_model_quantize(const std::string & fname_inp, const std::string & fna
 
 /* External API */
 
-const std::vector<gpt_vocab::id>& llama_context_get_embd(const llama_context& ctx) {
+const std::vector<gpt_vocab::id>& llama_context_get_embedding(const llama_context& ctx) {
     return ctx.state->embd;
 }
 gpt_vocab& llama_context_get_vocab(llama_context& ctx) {
     return ctx.vocab;
 }
-bool llama_context_not_finished(const llama_context& ctx)
+bool llama_context_is_finished(const llama_context& ctx)
 {
-    return ctx.state->remaining_tokens > 0;
+    return ctx.state->remaining_tokens <= 0;
 }
 const std::vector<gpt_vocab::id> llama_tokenize_text(const llama_context& ctx, const std::string& text) {
     return llama_tokenize(ctx.vocab, text, true);
@@ -1129,7 +1129,7 @@ bool llama_eval(
     return true;
 }
 
-bool llama_init_context_with_prompt(llama_context& ctx, const std::string& text, bool clear_existing) {
+bool llama_update_context_with_prompt(llama_context& ctx, const std::string& text, bool clear_existing) {
     llama_state& state = *ctx.state;
     llama_model& model = ctx.model;
     const gpt_params& params = ctx.params;
@@ -1165,9 +1165,9 @@ bool llama_init_context_with_prompt(llama_context& ctx, const std::string& text,
     return true;
 }
 
-/// @brief  Injests a batch of input tokens into the context
+/// @brief  Ingests a batch of input tokens into the context
 /// @param ctx 
-void llama_injest_input_batch(llama_context& ctx)
+void llama_ingest_input_batch(llama_context& ctx)
 {
     llama_state& state = *ctx.state;
     const gpt_params& params = ctx.params;
@@ -1233,22 +1233,22 @@ gpt_vocab::id llama_sample_token(llama_context& ctx)
     }
     return id;
 }
-/// @brief Injest all input (in multiple batches) into model and run call predict()
+/// @brief Ingest all input (in multiple batches) into model and call predict()
 /// @param ctx  
-bool llama_injest_input(llama_context& ctx, const std::string& text, bool clear_existing)
+bool llama_ingest_input(llama_context& ctx, const std::string& text, bool clear_existing)
 {
     llama_state& state = *ctx.state;
 
     // Initialize context, tokenize text and clear existing state if necessary
-    if(!state.is_initialized && !llama_init_context_with_prompt(ctx, text, clear_existing))
+    if(!state.is_initialized && !llama_update_context_with_prompt(ctx, text, clear_existing))
     {
         return false;
     }
 
-    // Injest the tokens into the model one batch at a time
+    // Ingest the tokens into the model one batch at a time
     while (state.has_more_input()) 
     {
-        llama_injest_input_batch(ctx);
+        llama_ingest_input_batch(ctx);
         if (state.embd.size() >= 0) {
             if(!llama_predict(ctx))
             {
@@ -1260,7 +1260,7 @@ bool llama_injest_input(llama_context& ctx, const std::string& text, bool clear_
     }
     return true;
 }
-bool llama_inference(llama_context& ctx, gpt_vocab::id& id) {
+bool llama_infer(llama_context& ctx, gpt_vocab::id& id) {
     llama_state& state = *ctx.state;
 
     // Tokenize text if we are starting out
diff --git a/llama.h b/llama.h
@@ -39,26 +39,27 @@ struct llama_hparams {
 
 struct llama_context;
 
-void llama_free_context(llama_context* ctx);
+// Startup
+llama_context* llama_init_from_params(const gpt_params& params);
 
-const std::vector<gpt_vocab::id>& llama_context_get_embd(const llama_context& ctx);
-gpt_vocab& llama_context_get_vocab(llama_context& ctx);
-bool llama_context_not_finished(const llama_context& ctx);
+// Input processing and inference
+bool llama_ingest_input(llama_context& ctx, const std::string& text, bool clear_existing = true);
+bool llama_context_is_finished(const llama_context& ctx);
+bool llama_update_context_with_prompt(llama_context& ctx, const std::string& text, bool clear_existing = true);
 const std::vector<gpt_vocab::id> llama_tokenize_text(const llama_context& ctx, const std::string& text);
+bool llama_infer(llama_context& ctx, gpt_vocab::id& model_output);
 
-const std::vector<gpt_vocab::id>& llama_context_get_last_n_tokens(const llama_context& ctx);
-bool llama_init_context_with_prompt(llama_context& ctx, const std::string& text, bool clear_existing = true);
-
-// Various functions for loading a ggml LLaMA model.
-llama_context* llama_init_from_params(const gpt_params& params);
+// Teardown
+void llama_free_context(llama_context* ctx);
 
-// Run inference on a LLaMA model using llama_context.
-std::vector<float> llama_eval(llama_context& ctx, const gpt_params& params, std::string& text);
+// Getters and setters
+gpt_vocab& llama_context_get_vocab(llama_context& ctx);
+const std::vector<gpt_vocab::id>& llama_context_get_embedding(const llama_context& ctx);
+const std::vector<gpt_vocab::id>& llama_context_get_last_n_tokens(const llama_context& ctx);
 
+// Other
 bool llama_model_quantize(const std::string & fname_inp, const std::string & fname_out, int itype);
 
-bool llama_injest_input(llama_context& ctx, const std::string& text, bool clear_existing = true);
-
-bool llama_inference(llama_context& ctx, gpt_vocab::id& model_output);
+// Stats
 void llama_print_context_info(const llama_context& ctx);
 void llama_print_end_stats(const llama_context& ctx);

Original file line number	Diff line number	Diff line change
`@@ -810,15 +810,15 @@ bool llama_model_quantize(const std::string & fname_inp, const std::string & fna`
`810`	`810`
`811`	`811`	`/* External API */`
`812`	`812`
`813`		`-const std::vector<gpt_vocab::id>& llama_context_get_embd(const llama_context& ctx) {`
	`813`	`+const std::vector<gpt_vocab::id>& llama_context_get_embedding(const llama_context& ctx) {`
`814`	`814`	`return ctx.state->embd;`
`815`	`815`	`}`
`816`	`816`	`gpt_vocab& llama_context_get_vocab(llama_context& ctx) {`
`817`	`817`	`return ctx.vocab;`
`818`	`818`	`}`
`819`		`-bool llama_context_not_finished(const llama_context& ctx)`
	`819`	`+bool llama_context_is_finished(const llama_context& ctx)`
`820`	`820`	`{`
`821`		`- return ctx.state->remaining_tokens > 0;`
	`821`	`+ return ctx.state->remaining_tokens <= 0;`
`822`	`822`	`}`
`823`	`823`	`const std::vector<gpt_vocab::id> llama_tokenize_text(const llama_context& ctx, const std::string& text) {`
`824`	`824`	`return llama_tokenize(ctx.vocab, text, true);`
`@@ -1129,7 +1129,7 @@ bool llama_eval(`
`1129`	`1129`	`return true;`
`1130`	`1130`	`}`
`1131`	`1131`
`1132`		`-bool llama_init_context_with_prompt(llama_context& ctx, const std::string& text, bool clear_existing) {`
	`1132`	`+bool llama_update_context_with_prompt(llama_context& ctx, const std::string& text, bool clear_existing) {`
`1133`	`1133`	`llama_state& state = *ctx.state;`
`1134`	`1134`	`llama_model& model = ctx.model;`
`1135`	`1135`	`const gpt_params& params = ctx.params;`
`@@ -1165,9 +1165,9 @@ bool llama_init_context_with_prompt(llama_context& ctx, const std::string& text,`
`1165`	`1165`	`return true;`
`1166`	`1166`	`}`
`1167`	`1167`
`1168`		`-/// @brief Injests a batch of input tokens into the context`
	`1168`	`+/// @brief Ingests a batch of input tokens into the context`
`1169`	`1169`	`/// @param ctx`
`1170`		`-void llama_injest_input_batch(llama_context& ctx)`
	`1170`	`+void llama_ingest_input_batch(llama_context& ctx)`
`1171`	`1171`	`{`
`1172`	`1172`	`llama_state& state = *ctx.state;`
`1173`	`1173`	`const gpt_params& params = ctx.params;`
`@@ -1233,22 +1233,22 @@ gpt_vocab::id llama_sample_token(llama_context& ctx)`
`1233`	`1233`	`}`
`1234`	`1234`	`return id;`
`1235`	`1235`	`}`
`1236`		`-/// @brief Injest all input (in multiple batches) into model and run call predict()`
	`1236`	`+/// @brief Ingest all input (in multiple batches) into model and call predict()`
`1237`	`1237`	`/// @param ctx`
`1238`		`-bool llama_injest_input(llama_context& ctx, const std::string& text, bool clear_existing)`
	`1238`	`+bool llama_ingest_input(llama_context& ctx, const std::string& text, bool clear_existing)`
`1239`	`1239`	`{`
`1240`	`1240`	`llama_state& state = *ctx.state;`
`1241`	`1241`
`1242`	`1242`	`// Initialize context, tokenize text and clear existing state if necessary`
`1243`		`- if(!state.is_initialized && !llama_init_context_with_prompt(ctx, text, clear_existing))`
	`1243`	`+ if(!state.is_initialized && !llama_update_context_with_prompt(ctx, text, clear_existing))`
`1244`	`1244`	`{`
`1245`	`1245`	`return false;`
`1246`	`1246`	`}`
`1247`	`1247`
`1248`		`- // Injest the tokens into the model one batch at a time`
	`1248`	`+ // Ingest the tokens into the model one batch at a time`
`1249`	`1249`	`while (state.has_more_input())`
`1250`	`1250`	`{`
`1251`		`- llama_injest_input_batch(ctx);`
	`1251`	`+ llama_ingest_input_batch(ctx);`
`1252`	`1252`	`if (state.embd.size() >= 0) {`
`1253`	`1253`	`if(!llama_predict(ctx))`
`1254`	`1254`	`{`
`@@ -1260,7 +1260,7 @@ bool llama_injest_input(llama_context& ctx, const std::string& text, bool clear_`
`1260`	`1260`	`}`
`1261`	`1261`	`return true;`
`1262`	`1262`	`}`
`1263`		`-bool llama_inference(llama_context& ctx, gpt_vocab::id& id) {`
	`1263`	`+bool llama_infer(llama_context& ctx, gpt_vocab::id& id) {`
`1264`	`1264`	`llama_state& state = *ctx.state;`
`1265`	`1265`
`1266`	`1266`	`// Tokenize text if we are starting out`