@@ -647,6 +647,22 @@ static bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg,
        params.model = argv[i];
        return true;
    }
+    if (arg == "-md" || arg == "--model-draft") {
+        if (++i >= argc) {
+            invalid_param = true;
+            return true;
+        }
+        params.model_draft = argv[i];
+        return true;
+    }
+    if (arg == "-a" || arg == "--alias") {
+        if (++i >= argc) {
+            invalid_param = true;
+            return true;
+        }
+        params.model_alias = argv[i];
+        return true;
+    }
    if (arg == "-mu" || arg == "--model-url") {
        if (++i >= argc) {
            invalid_param = true;
@@ -655,20 +671,20 @@ static bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg,
        params.model_url = argv[i];
        return true;
    }
-    if (arg == "-md" || arg == "--model-draft") {
+    if (arg == "-hfr" || arg == "--hf-repo") {
        if (++i >= argc) {
            invalid_param = true;
            return true;
        }
-        params.model_draft = argv[i];
+        params.hf_repo = argv[i];
        return true;
    }
-    if (arg == "-a" || arg == "--alias") {
+    if (arg == "-hff" || arg == "--hf-file") {
        if (++i >= argc) {
            invalid_param = true;
            return true;
        }
-        params.model_alias = argv[i];
+        params.hf_file = argv[i];
        return true;
    }
    if (arg == "--lora") {
@@ -1403,10 +1419,14 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
    printf("                        layer range to apply the control vector(s) to, start and end inclusive\n");
    printf("  -m FNAME, --model FNAME\n");
    printf("                        model path (default: %s)\n", params.model.c_str());
-    printf("  -mu MODEL_URL, --model-url MODEL_URL\n");
-    printf("                        model download url (default: %s)\n", params.model_url.c_str());
    printf("  -md FNAME, --model-draft FNAME\n");
-    printf("                        draft model for speculative decoding\n");
+    printf("                        draft model for speculative decoding (default: unused)\n");
+    printf("  -mu MODEL_URL, --model-url MODEL_URL\n");
+    printf("                        model download url (default: unused)\n");
+    printf("  -hfr REPO, --hf-repo REPO\n");
+    printf("                        Hugging Face model repository (default: unused)\n");
+    printf("  -hff FILE, --hf-file FILE\n");
+    printf("                        Hugging Face model file (default: unused)\n");
    printf("  -ld LOGDIR, --logdir LOGDIR\n");
    printf("                        path under which to save YAML logs (no logging if unset)\n");
    printf("  --override-kv KEY=TYPE:VALUE\n");
@@ -1655,8 +1675,10 @@ void llama_batch_add(

#ifdef LLAMA_USE_CURL

-struct llama_model * llama_load_model_from_url(const char * model_url, const char * path_model,
-                                               struct llama_model_params params) {
+struct llama_model * llama_load_model_from_url(
+        const char * model_url,
+        const char * path_model,
+        const struct llama_model_params & params) {
    // Basic validation of the model_url
    if (!model_url || strlen(model_url) == 0) {
        fprintf(stderr, "%s: invalid model_url\n", __func__);
@@ -1850,25 +1872,62 @@ struct llama_model * llama_load_model_from_url(const char * model_url, const cha
    return llama_load_model_from_file(path_model, params);
}

+struct llama_model * llama_load_model_from_hf(
+        const char * repo,
+        const char * model,
+        const char * path_model,
+        const struct llama_model_params & params) {
+    // construct hugging face model url:
+    //
+    //   --repo ggml-org/models --file tinyllama-1.1b/ggml-model-f16.gguf
+    //     https://huggingface.co/ggml-org/models/resolve/main/tinyllama-1.1b/ggml-model-f16.gguf
+    //
+    //   --repo TheBloke/Mixtral-8x7B-v0.1-GGUF --file mixtral-8x7b-v0.1.Q4_K_M.gguf
+    //     https://huggingface.co/TheBloke/Mixtral-8x7B-v0.1-GGUF/resolve/main/mixtral-8x7b-v0.1.Q4_K_M.gguf
+    //
+
+    std::string model_url = "https://huggingface.co/";
+    model_url += repo;
+    model_url += "/resolve/main/";
+    model_url += model;
+
+    return llama_load_model_from_url(model_url.c_str(), path_model, params);
+}
+
#else

-struct llama_model * llama_load_model_from_url(const char * /*model_url*/, const char * /*path_model*/,
-                                               struct llama_model_params /*params*/) {
+struct llama_model * llama_load_model_from_url(
+        const char * /*model_url*/,
+        const char * /*path_model*/,
+        const struct llama_model_params & /*params*/) {
    fprintf(stderr, "%s: llama.cpp built without libcurl, downloading from an url not supported.\n", __func__);
    return nullptr;
}

+struct llama_model * llama_load_model_from_hf(
+        const char * /*repo*/,
+        const char * /*model*/,
+        const char * /*path_model*/,
+        const struct llama_model_params & /*params*/) {
+    fprintf(stderr, "%s: llama.cpp built without libcurl, downloading from Hugging Face not supported.\n", __func__);
+    return nullptr;
+}
+
#endif // LLAMA_USE_CURL

std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_params(gpt_params & params) {
    auto mparams = llama_model_params_from_gpt_params(params);

    llama_model * model = nullptr;
-    if (!params.model_url.empty()) {
+
+    if (!params.hf_repo.empty() && !params.hf_file.empty()) {
+        model = llama_load_model_from_hf(params.hf_repo.c_str(), params.hf_file.c_str(), params.model.c_str(), mparams);
+    } else if (!params.model_url.empty()) {
        model = llama_load_model_from_url(params.model_url.c_str(), params.model.c_str(), mparams);
    } else {
        model = llama_load_model_from_file(params.model.c_str(), mparams);
    }
+
    if (model == NULL) {
        fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, params.model.c_str());
        return std::make_tuple(nullptr, nullptr);
@@ -1908,7 +1967,7 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
    }

    for (unsigned int i = 0; i < params.lora_adapter.size(); ++i) {
-        const std::string& lora_adapter = std::get<0>(params.lora_adapter[i]);
+        const std::string & lora_adapter = std::get<0>(params.lora_adapter[i]);
        float lora_scale = std::get<1>(params.lora_adapter[i]);
        int err = llama_model_apply_lora_from_file(model,
                                             lora_adapter.c_str(),
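
As a rough usage sketch (not part of this diff), this is how the new `llama_load_model_from_hf` helper could be called directly from C++, assuming the matching declaration lands in `common.h` and the binary is built with `LLAMA_USE_CURL`; the repo, file, and local path values are placeholders:

```cpp
// Minimal sketch, assuming llama_load_model_from_hf() is declared in common.h.
// Roughly equivalent to the new CLI flags:
//   -hfr ggml-org/models -hff tinyllama-1.1b/ggml-model-f16.gguf -m tinyllama-1.1b-f16.gguf
// Backend/context setup is omitted for brevity.
#include "common.h"
#include "llama.h"

#include <cstdio>

int main() {
    llama_model_params mparams = llama_model_default_params();

    llama_model * model = llama_load_model_from_hf(
        "ggml-org/models",                     // Hugging Face repository
        "tinyllama-1.1b/ggml-model-f16.gguf",  // file inside the repository
        "tinyllama-1.1b-f16.gguf",             // local path used to store/load the model
        mparams);

    if (model == nullptr) {
        fprintf(stderr, "failed to download or load the model\n");
        return 1;
    }

    llama_free_model(model);
    return 0;
}
```

When both `--hf-repo` and `--hf-file` are set, `llama_init_from_gpt_params` takes the same path internally, falling back to `--model-url` and finally to the local `--model` file.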