
Commit 2cc5044

ochafik authored and nopperl committed
Improve usability of --model-url & related flags (ggml-org#6930)
* args: default --model to models/ + filename from --model-url or --hf-file (or else legacy models/7B/ggml-model-f16.gguf)
* args: main & server now call gpt_params_handle_model_default
* args: define DEFAULT_MODEL_PATH + update cli docs
* curl: check url of previous download (.json metadata w/ url, etag & lastModified)
* args: fix update to quantize-stats.cpp
* curl: support legacy .etag / .lastModified companion files
* curl: rm legacy .etag file support
* curl: reuse regex across headers callback calls
* curl: unique_ptr to manage lifecycle of curl & outfile
* curl: nit: no need for multiline regex flag
* curl: update failed test (model file collision) + gitignore *.gguf.json
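The `curl: unique_ptr to manage lifecycle of curl & outfile` item refers to RAII cleanup of the two download handles. The actual code lives in the common.cpp diff below, which is not rendered; a minimal sketch of the pattern (the `download_file` name is hypothetical and error handling is elided):

```cpp
#include <cstdio>
#include <memory>

#include <curl/curl.h>

// Sketch only: both handles are released on every return path, including
// early errors, without explicit curl_easy_cleanup()/fclose() calls.
static bool download_file(const char * url, const char * path) {
    std::unique_ptr<CURL, decltype(&curl_easy_cleanup)> curl(curl_easy_init(), &curl_easy_cleanup);
    if (!curl) {
        return false;
    }

    std::unique_ptr<FILE, decltype(&fclose)> outfile(fopen(path, "wb"), &fclose);
    if (!outfile) {
        return false;
    }

    curl_easy_setopt(curl.get(), CURLOPT_URL, url);
    // libcurl's default write callback writes straight into the FILE*
    curl_easy_setopt(curl.get(), CURLOPT_WRITEDATA, outfile.get());

    return curl_easy_perform(curl.get()) == CURLE_OK;
}
```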
1 parent 397863e · commit 2cc5044

7 files changed: +144, −133 lines changed


.gitignore

Lines changed: 1 addition & 0 deletions
```diff
@@ -2,6 +2,7 @@
 *.a
 *.so
 *.gguf
+*.gguf.json
 *.bin
 *.exe
 *.dll
```
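The new pattern ignores the metadata companion files this commit writes next to each downloaded model (see `curl: check url of previous download` in the commit message). The exact layout is not shown in this diff; per the commit message it records the url, etag and lastModified of the previous download, so a plausible (hypothetical) `some-model.gguf.json` would look like:

```json
{
  "url": "https://huggingface.co/ggml-org/models/resolve/main/phi-2/ggml-model-q4_0.gguf",
  "etag": "\"<ETag header reported by the server>\"",
  "lastModified": "<Last-Modified header reported by the server>"
}
```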

common/common.cpp

Lines changed: 132 additions & 128 deletions
Large diffs are not rendered by default.
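This file contains the new `gpt_params_handle_model_default` implementation, which the rendered diffs below only declare and call. A minimal sketch of its behavior, inferred from the commit message and the updated server `--help` text (the `hf_file` field name and the `filename_from` helper are assumptions, not the real code):

```cpp
#include <string>

#define DEFAULT_MODEL_PATH "models/7B/ggml-model-f16.gguf"

// Minimal stand-in for the real gpt_params (common.h) with just the fields
// this sketch needs; hf_file is assumed from the --hf-file flag.
struct gpt_params {
    std::string model;
    std::string model_url;
    std::string hf_file;
};

// Hypothetical helper: filename portion after the last '/'.
static std::string filename_from(const std::string & path_or_url) {
    const size_t pos = path_or_url.find_last_of('/');
    return pos == std::string::npos ? path_or_url : path_or_url.substr(pos + 1);
}

void gpt_params_handle_model_default(gpt_params & params) {
    if (!params.model.empty()) {
        return; // an explicit --model path wins
    }
    if (!params.model_url.empty()) {
        params.model = "models/" + filename_from(params.model_url); // models/ + url filename
    } else if (!params.hf_file.empty()) {
        params.model = "models/" + filename_from(params.hf_file);   // models/ + hf filename
    } else {
        params.model = DEFAULT_MODEL_PATH;                          // legacy default
    }
}
```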

common/common.h

Lines changed: 5 additions & 1 deletion
```diff
@@ -31,6 +31,8 @@
     fprintf(stderr, "%s: built with %s for %s\n", __func__, LLAMA_COMPILER, LLAMA_BUILD_TARGET); \
 } while(0)
 
+#define DEFAULT_MODEL_PATH "models/7B/ggml-model-f16.gguf"
+
 // build info
 extern int LLAMA_BUILD_NUMBER;
 extern char const *LLAMA_COMMIT;
@@ -92,7 +94,7 @@ struct gpt_params {
     // sampling parameters
     struct llama_sampling_params sparams;
 
-    std::string model = "models/7B/ggml-model-f16.gguf"; // model path
+    std::string model = ""; // model path
     std::string model_draft = ""; // draft model for speculative decoding
     std::string model_alias = "unknown"; // model alias
     std::string model_url = ""; // model url to download
@@ -171,6 +173,8 @@ struct gpt_params {
     std::vector<std::string> image; // path to image file(s)
 };
 
+void gpt_params_handle_model_default(gpt_params & params);
+
 bool parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides);
 
 bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params);
```

examples/main/README.md

Lines changed: 1 addition & 1 deletion
```diff
@@ -66,7 +66,7 @@ main.exe -m models\7B\ggml-model.bin --ignore-eos -n -1 --random-prompt
 
 In this section, we cover the most commonly used options for running the `main` program with the LLaMA models:
 
-- `-m FNAME, --model FNAME`: Specify the path to the LLaMA model file (e.g., `models/7B/ggml-model.bin`).
+- `-m FNAME, --model FNAME`: Specify the path to the LLaMA model file (e.g., `models/7B/ggml-model.gguf`; inferred from `--model-url` if set).
 - `-mu MODEL_URL --model-url MODEL_URL`: Specify a remote http url to download the file (e.g https://huggingface.co/ggml-org/models/resolve/main/phi-2/ggml-model-q4_0.gguf).
 - `-i, --interactive`: Run the program in interactive mode, allowing you to provide input directly and receive real-time responses.
 - `-ins, --instruct`: Run the program in instruction mode, which is particularly useful when working with Alpaca models.
```
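With this change an explicit `-m` is no longer required alongside `--model-url`; the download should land under `models/` using the URL's filename. An illustrative invocation (not from the README):

```sh
# --model is inferred as models/ggml-model-q4_0.gguf
./main -mu https://huggingface.co/ggml-org/models/resolve/main/phi-2/ggml-model-q4_0.gguf -p "Hello"
```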

examples/quantize-stats/quantize-stats.cpp

Lines changed: 1 addition & 1 deletion
```diff
@@ -23,7 +23,7 @@
 #endif
 
 struct quantize_stats_params {
-    std::string model = "models/7B/ggml-model-f16.gguf";
+    std::string model = DEFAULT_MODEL_PATH;
     bool verbose = false;
     bool per_layer_stats = false;
     bool print_histogram = false;
```

examples/server/server.cpp

Lines changed: 3 additions & 1 deletion
```diff
@@ -2353,7 +2353,7 @@ static void server_print_usage(const char * argv0, const gpt_params & params, co
         printf(" disable KV offload\n");
     }
     printf(" -m FNAME, --model FNAME\n");
-    printf(" model path (default: %s)\n", params.model.c_str());
+    printf(" model path (default: models/$filename with filename from --hf-file or --model-url if set, otherwise %s)\n", DEFAULT_MODEL_PATH);
     printf(" -mu MODEL_URL, --model-url MODEL_URL\n");
     printf(" model download url (default: unused)\n");
     printf(" -hfr REPO, --hf-repo REPO\n");
@@ -2835,6 +2835,8 @@ static void server_params_parse(int argc, char ** argv, server_params & sparams,
         }
     }
 
+    gpt_params_handle_model_default(params);
+
     if (!params.kv_overrides.empty()) {
         params.kv_overrides.emplace_back();
         params.kv_overrides.back().key[0] = 0;
```

examples/server/tests/features/embeddings.feature

Lines changed: 1 addition & 1 deletion
```diff
@@ -5,7 +5,7 @@ Feature: llama.cpp server
   Background: Server startup
     Given a server listening on localhost:8080
     And   a model url https://huggingface.co/ggml-org/models/resolve/main/bert-bge-small/ggml-model-f16.gguf
-    And   a model file ggml-model-f16.gguf
+    And   a model file bert-bge-small.gguf
     And   a model alias bert-bge-small
     And   42 as server seed
     And   2 slots
```
