Commit 7f9cc20

common : refactor args
ggml-ci
1 parent c8880e7 commit 7f9cc20

22 files changed: +330 -294 lines changed


common/arg.cpp

Lines changed: 231 additions & 214 deletions
Large diffs are not rendered by default.

common/common.cpp

Lines changed: 2 additions & 2 deletions
```diff
@@ -925,9 +925,9 @@ struct common_init_result common_init_from_params(common_params & params) {
         common_lora_adapters_apply(lctx, iparams.lora_adapters);
     }
 
-    if (params.sparams.ignore_eos && llama_token_eos(model) == LLAMA_TOKEN_NULL) {
+    if (params.sampling.ignore_eos && llama_token_eos(model) == LLAMA_TOKEN_NULL) {
         LOG_WRN("%s: warning: model does not have an EOS token, ignoring --ignore-eos\n", __func__);
-        params.sparams.ignore_eos = false;
+        params.sampling.ignore_eos = false;
     }
 
     if (params.warmup) {
```

common/common.h

Lines changed: 18 additions & 10 deletions
```diff
@@ -103,8 +103,8 @@ enum dimre_method {
     DIMRE_METHOD_MEAN,
 };
 
-// sampler parameters
-struct common_sampler_params {
+// sampling parameters
+struct common_params_sampling {
     uint32_t seed = LLAMA_DEFAULT_SEED; // the seed used to initialize llama_sampler
 
     int32_t n_prev = 64; // number of previous tokens to remember
@@ -155,20 +155,30 @@ struct common_sampler_params {
     std::string print() const;
 };
 
+struct common_params_speculative {
+    int32_t n_ctx = 4096;        // draft context size
+    int32_t n_max = 5;           // maximum number of tokens to draft during speculative decoding
+    int32_t n_min = 0;           // minimum number of draft tokens to use for speculative decoding
+    int32_t n_gpu_layers = -1;   // number of layers to store in VRAM for the draft model (-1 - use default)
+    float   p_split = 0.1f;      // speculative decoding split probability
+    float   p_min = 0.9f;        // minimum speculative decoding probability (greedy)
+
+    struct cpu_params cpuparams;
+    struct cpu_params cpuparams_batch;
+
+    std::string model = ""; // draft model for speculative decoding // NOLINT
+};
+
 struct common_params {
     int32_t n_predict = -1;    // new tokens to predict
     int32_t n_ctx = 4096;      // context size
     int32_t n_batch = 2048;    // logical batch size for prompt processing (must be >=32 to use BLAS)
     int32_t n_ubatch = 512;    // physical batch size for prompt processing (must be >=32 to use BLAS)
     int32_t n_keep = 0;        // number of tokens to keep from initial prompt
-    int32_t n_draft = 5;       // number of tokens to draft during speculative decoding
-    int32_t n_draft_min = 0;   // minimum number of draft tokens to use for speculative decoding
     int32_t n_chunks = -1;     // max number of chunks to process (-1 = unlimited)
     int32_t n_parallel = 1;    // number of parallel sequences to decode
     int32_t n_sequences = 1;   // number of sequences to decode
-    float   p_split = 0.1f;    // speculative decoding split probability
     int32_t n_gpu_layers = -1; // number of layers to store in VRAM (-1 - use default)
-    int32_t n_gpu_layers_draft = -1; // number of layers to store in VRAM for the draft model (-1 - use default)
     int32_t main_gpu = 0;      // the GPU that is used for scratch and small tensors
     float   tensor_split[128] = {0}; // how split tensors should be distributed across GPUs
     int32_t grp_attn_n = 1;    // group-attention factor
@@ -185,8 +195,6 @@ struct common_params {
 
     struct cpu_params cpuparams;
     struct cpu_params cpuparams_batch;
-    struct cpu_params draft_cpuparams;
-    struct cpu_params draft_cpuparams_batch;
 
     ggml_backend_sched_eval_callback cb_eval = nullptr;
     void * cb_eval_user_data = nullptr;
@@ -198,10 +206,10 @@ struct common_params {
     enum llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_UNSPECIFIED; // pooling type for embeddings
     enum llama_attention_type attention_type = LLAMA_ATTENTION_TYPE_UNSPECIFIED; // attention type for embeddings
 
-    struct common_sampler_params sparams;
+    struct common_params_sampling sampling;
+    struct common_params_speculative speculative;
 
     std::string model = ""; // model path // NOLINT
-    std::string model_draft = ""; // draft model for speculative decoding // NOLINT
     std::string model_alias = "unknown"; // model alias // NOLINT
     std::string model_url = ""; // model url to download // NOLINT
     std::string hf_token = ""; // HF token // NOLINT
```

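For code that consumes common.h, the net effect of the hunks above is a rename plus a regrouping: sampling options move from `params.sparams` (type `common_sampler_params`) to `params.sampling` (type `common_params_sampling`), and the draft-model options that previously sat directly on `common_params` (`n_draft`, `n_draft_min`, `p_split`, `n_gpu_layers_draft`, `draft_cpuparams*`, `model_draft`) are grouped under `params.speculative`. A minimal migration sketch, with illustrative values and a hypothetical helper name that are not part of this commit:

```cpp
#include "common.h"

// Illustrative only: shows where the renamed/regrouped fields now live.
static void configure_params(common_params & params) {
    // before: params.sparams.top_k, params.sparams.seed
    params.sampling.top_k = 40;
    params.sampling.seed  = LLAMA_DEFAULT_SEED;

    // before: params.model_draft, params.n_draft, params.n_draft_min, params.n_gpu_layers_draft
    params.speculative.model        = "models/draft.gguf"; // hypothetical path
    params.speculative.n_max        = 16;
    params.speculative.n_min        = 5;
    params.speculative.n_gpu_layers = 99;
}
```
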
common/sampling.cpp

Lines changed: 3 additions & 3 deletions
```diff
@@ -99,7 +99,7 @@ struct ring_buffer {
 };
 
 struct common_sampler {
-    common_sampler_params params;
+    common_params_sampling params;
 
     struct llama_sampler * grmr;
     struct llama_sampler * chain;
@@ -125,7 +125,7 @@ struct common_sampler {
     }
 };
 
-std::string common_sampler_params::print() const {
+std::string common_params_sampling::print() const {
     char result[1024];
 
     snprintf(result, sizeof(result),
@@ -141,7 +141,7 @@ std::string common_sampler_params::print() const {
     return std::string(result);
 }
 
-struct common_sampler * common_sampler_init(const struct llama_model * model, const struct common_sampler_params & params) {
+struct common_sampler * common_sampler_init(const struct llama_model * model, const struct common_params_sampling & params) {
     llama_sampler_chain_params lparams = llama_sampler_chain_default_params();
 
     lparams.no_perf = params.no_perf;
```

common/sampling.h

Lines changed: 1 addition & 1 deletion
```diff
@@ -36,7 +36,7 @@ struct common_sampler;
 
 // llama_sampler API overloads
 
-struct common_sampler * common_sampler_init(const struct llama_model * model, const struct common_sampler_params & params);
+struct common_sampler * common_sampler_init(const struct llama_model * model, const struct common_params_sampling & params);
 
 void common_sampler_free(struct common_sampler * gsmpl);
 
```

common/speculative.cpp

Lines changed: 11 additions & 11 deletions
```diff
@@ -29,32 +29,32 @@ struct common_speculative * common_speculative_init(
     // TODO: optimize or pass from outside?
 #if 0
     {
-        common_sampler_params sparams;
-        sparams.no_perf = false;
+        common_params_sampling params;
+        params.no_perf = false;
 
-        sparams.top_k = 40;
-        sparams.top_p = 0.9;
+        params.top_k = 40;
+        params.top_p = 0.9;
 
-        sparams.samplers = {
+        params.samplers = {
             COMMON_SAMPLER_TYPE_TOP_K,
             COMMON_SAMPLER_TYPE_TOP_P,
             COMMON_SAMPLER_TYPE_INFILL,
         };
 
-        result->smpl = common_sampler_init(llama_get_model(ctx_dft), sparams);
+        result->smpl = common_sampler_init(llama_get_model(ctx_dft), params);
     }
 #else
     {
-        common_sampler_params sparams;
-        sparams.no_perf = false;
+        common_params_sampling params;
+        params.no_perf = false;
 
-        sparams.top_k = 10;
+        params.top_k = 10;
 
-        sparams.samplers = {
+        params.samplers = {
             COMMON_SAMPLER_TYPE_TOP_K,
         };
 
-        result->smpl = common_sampler_init(llama_get_model(ctx_dft), sparams);
+        result->smpl = common_sampler_init(llama_get_model(ctx_dft), params);
     }
 #endif
 
```

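As a usage note for the rename above: a standalone draft sampler can be built the same way the `#else` branch does. This is a minimal sketch that assumes a draft `llama_model * mdl` has already been loaded; the helper name is illustrative, not part of the commit:

```cpp
#include "common.h"
#include "sampling.h"

// Sketch: top-k draft sampler built with the renamed common_params_sampling struct.
static struct common_sampler * make_draft_sampler(const struct llama_model * mdl) {
    common_params_sampling params;
    params.no_perf = false;
    params.top_k   = 10;

    params.samplers = {
        COMMON_SAMPLER_TYPE_TOP_K,
    };

    return common_sampler_init(mdl, params);
}
```
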
examples/batched/batched.cpp

Lines changed: 4 additions & 4 deletions
```diff
@@ -68,10 +68,10 @@ int main(int argc, char ** argv) {
 
     llama_sampler * smpl = llama_sampler_chain_init(sparams);
 
-    llama_sampler_chain_add(smpl, llama_sampler_init_top_k(params.sparams.top_k));
-    llama_sampler_chain_add(smpl, llama_sampler_init_top_p(params.sparams.top_p, params.sparams.min_keep));
-    llama_sampler_chain_add(smpl, llama_sampler_init_temp (params.sparams.temp));
-    llama_sampler_chain_add(smpl, llama_sampler_init_dist (params.sparams.seed));
+    llama_sampler_chain_add(smpl, llama_sampler_init_top_k(params.sampling.top_k));
+    llama_sampler_chain_add(smpl, llama_sampler_init_top_p(params.sampling.top_p, params.sampling.min_keep));
+    llama_sampler_chain_add(smpl, llama_sampler_init_temp (params.sampling.temp));
+    llama_sampler_chain_add(smpl, llama_sampler_init_dist (params.sampling.seed));
 
     if (ctx == NULL) {
         LOG_ERR("%s: error: failed to create the llama_context\n" , __func__);
```

examples/infill/infill.cpp

Lines changed: 1 addition & 1 deletion
```diff
@@ -73,7 +73,7 @@ int main(int argc, char ** argv) {
 
     common_init();
 
-    auto & sparams = params.sparams;
+    auto & sparams = params.sampling;
 
     console::init(params.simple_io, params.use_color);
     atexit([]() { console::cleanup(); });
```

examples/llava/llava-cli.cpp

Lines changed: 1 addition & 1 deletion
```diff
@@ -191,7 +191,7 @@ static void process_prompt(struct llava_context * ctx_llava, struct llava_image_
 
     LOG("\n");
 
-    struct common_sampler * smpl = common_sampler_init(ctx_llava->model, params->sparams);
+    struct common_sampler * smpl = common_sampler_init(ctx_llava->model, params->sampling);
     if (!smpl) {
         LOG_ERR("%s: failed to initialize sampling subsystem\n", __func__);
         exit(1);
```

examples/llava/minicpmv-cli.cpp

Lines changed: 1 addition & 1 deletion
```diff
@@ -237,7 +237,7 @@ static struct common_sampler * llama_init(struct llava_context * ctx_llava, comm
 
     LOG_INF("\n");
 
-    struct common_sampler * smpl = common_sampler_init(ctx_llava->model, params->sparams);
+    struct common_sampler * smpl = common_sampler_init(ctx_llava->model, params->sampling);
     return smpl;
 }
 
```

examples/lookahead/lookahead.cpp

Lines changed: 1 addition & 1 deletion
```diff
@@ -115,7 +115,7 @@ int main(int argc, char ** argv) {
     llama_batch batch = llama_batch_init(params.n_ctx, 0, W + G + 1);
 
     // target model sampling context
-    struct common_sampler * smpl = common_sampler_init(model, params.sparams);
+    struct common_sampler * smpl = common_sampler_init(model, params.sampling);
 
     // verification n-grams
     std::vector<ngram_data> ngrams_cur(G);
```

examples/lookup/lookup-stats.cpp

Lines changed: 2 additions & 1 deletion
```diff
@@ -21,7 +21,7 @@ int main(int argc, char ** argv){
 
     common_init();
 
-    const int n_draft = params.n_draft;
+    const int n_draft = params.speculative.n_max;
 
     // init llama.cpp
     llama_backend_init();
@@ -40,6 +40,7 @@ int main(int argc, char ** argv){
     common_ngram_cache ngram_cache_context;
     common_ngram_cache ngram_cache_dynamic;
     common_ngram_cache ngram_cache_static;
+
     int64_t t_draft_flat_us = 0;
     int64_t t_draft_us = 0;
 
```

examples/lookup/lookup.cpp

Lines changed: 2 additions & 2 deletions
```diff
@@ -22,7 +22,7 @@ int main(int argc, char ** argv){
     common_init();
 
     // max. number of additional tokens to draft if match is found
-    const int n_draft = params.n_draft;
+    const int n_draft = params.speculative.n_max;
 
     const bool dump_kv_cache = params.dump_kv_cache;
 
@@ -102,7 +102,7 @@ int main(int argc, char ** argv){
 
     bool has_eos = false;
 
-    struct common_sampler * smpl = common_sampler_init(model, params.sparams);
+    struct common_sampler * smpl = common_sampler_init(model, params.sampling);
 
     std::vector<llama_token> draft;
 
```

examples/main/main.cpp

Lines changed: 1 addition & 1 deletion
```diff
@@ -100,7 +100,7 @@ int main(int argc, char ** argv) {
 
     common_init();
 
-    auto & sparams = params.sparams;
+    auto & sparams = params.sampling;
 
     // save choice to use color for later
     // (note for later: this is a slightly awkward choice)
```

examples/parallel/parallel.cpp

Lines changed: 1 addition & 1 deletion
```diff
@@ -160,7 +160,7 @@ int main(int argc, char ** argv) {
     for (size_t i = 0; i < clients.size(); ++i) {
         auto & client = clients[i];
         client.id = i;
-        client.smpl = common_sampler_init(model, params.sparams);
+        client.smpl = common_sampler_init(model, params.sampling);
     }
 
     std::vector<llama_token> tokens_system;
```

examples/retrieval/retrieval.cpp

Lines changed: 2 additions & 2 deletions
```diff
@@ -282,8 +282,8 @@ int main(int argc, char ** argv) {
         return a.second > b.second;
     });
 
-    LOG("Top %d similar chunks:\n", params.sparams.top_k);
-    for (int i = 0; i < std::min(params.sparams.top_k, (int) chunks.size()); i++) {
+    LOG("Top %d similar chunks:\n", params.sampling.top_k);
+    for (int i = 0; i < std::min(params.sampling.top_k, (int) chunks.size()); i++) {
         LOG("filename: %s\n", chunks[similarities[i].first].filename.c_str());
         LOG("filepos: %lld\n", (long long int) chunks[similarities[i].first].filepos);
         LOG("similarity: %f\n", similarities[i].second);
```

examples/save-load-state/save-load-state.cpp

Lines changed: 4 additions & 4 deletions
```diff
@@ -9,7 +9,7 @@ int main(int argc, char ** argv) {
     common_params params;
 
     params.prompt = "The quick brown fox";
-    params.sparams.seed = 1234;
+    params.sampling.seed = 1234;
 
     if (!common_params_parse(argc, argv, params, LLAMA_EXAMPLE_COMMON)) {
         return 1;
@@ -42,7 +42,7 @@ int main(int argc, char ** argv) {
 
     llama_sampler * smpl = llama_sampler_chain_init(sparams);
 
-    llama_sampler_chain_add(smpl, llama_sampler_init_dist(params.sparams.seed));
+    llama_sampler_chain_add(smpl, llama_sampler_init_dist(params.sampling.seed));
 
     // tokenize prompt
     auto tokens = common_tokenize(ctx, params.prompt, true);
@@ -106,7 +106,7 @@ int main(int argc, char ** argv) {
 
     llama_sampler * smpl2 = llama_sampler_chain_init(sparams);
 
-    llama_sampler_chain_add(smpl2, llama_sampler_init_dist(params.sparams.seed));
+    llama_sampler_chain_add(smpl2, llama_sampler_init_dist(params.sampling.seed));
 
     printf("\nsecond run: %s", params.prompt.c_str());
 
@@ -169,7 +169,7 @@ int main(int argc, char ** argv) {
 
     llama_sampler * smpl3 = llama_sampler_chain_init(sparams);
 
-    llama_sampler_chain_add(smpl3, llama_sampler_init_dist(params.sparams.seed));
+    llama_sampler_chain_add(smpl3, llama_sampler_init_dist(params.sampling.seed));
 
     printf("\nsingle seq run: %s", params.prompt.c_str());
 
```

examples/server/server.cpp

Lines changed: 3 additions & 3 deletions
```diff
@@ -175,7 +175,7 @@ struct server_slot {
     // sampling
     json json_schema;
 
-    struct common_sampler_params sparams;
+    struct common_params_sampling sparams;
     struct common_sampler * smpl = nullptr;
 
     llama_token sampled;
@@ -687,7 +687,7 @@ struct server_context {
 
         SLT_INF(slot, "new slot n_ctx_slot = %d\n", slot.n_ctx);
 
-        slot.sparams = params.sparams;
+        slot.sparams = params.sampling;
 
         slot.callback_on_release = [this](int) {
             queue_tasks.pop_deferred_task();
@@ -788,7 +788,7 @@ struct server_context {
     bool launch_slot_with_task(server_slot & slot, const server_task & task) {
         slot_params default_params;
         // Sampling parameter defaults are loaded from the global server context (but individual requests can still override them)
-        auto default_sparams = params.sparams;
+        auto default_sparams = params.sampling;
         const auto & data = task.data;
 
         if (data.count("__oaicompat") != 0) {
```

examples/speculative-simple/README.md

Lines changed: 9 additions & 0 deletions
````diff
@@ -1,3 +1,12 @@
 # llama.cpp/examples/speculative-simple
 
 Demonstration of basic greedy speculative decoding
+
+```bash
+./bin/llama-speculative-simple \
+  -m ../models/qwen2.5-32b-coder-instruct/ggml-model-q8_0.gguf \
+  -md ../models/qwen2.5-1.5b-coder-instruct/ggml-model-q4_0.gguf \
+  -f test.txt -c 0 -ngl 99 --color \
+  --sampling-seq k --top-k 1 -fa --temp 0.0 \
+  -ngld 99 --draft-max 16 --draft-min 5 --draft-p-min 0.9
+```
````

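The README command above drives the fields of the new `common_params_speculative` struct introduced in common.h. The flag-to-field correspondence is not spelled out on this page, so the sketch below is an assumption based on the field comments; the helper name is illustrative:

```cpp
#include "common.h"

// Assumed mapping from the README's draft-related flags to the new struct.
static void apply_readme_draft_settings(common_params & params) {
    params.speculative.model        = "../models/qwen2.5-1.5b-coder-instruct/ggml-model-q4_0.gguf"; // -md
    params.speculative.n_gpu_layers = 99;   // -ngld 99
    params.speculative.n_max        = 16;   // --draft-max 16
    params.speculative.n_min        = 5;    // --draft-min 5
    params.speculative.p_min        = 0.9f; // --draft-p-min 0.9
}
```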