
Commit 5b605d0

Merge branch 'upstream' into concedo_experimental
# Conflicts:
#	.github/ISSUE_TEMPLATE/config.yml
#	.gitignore
#	CMakeLists.txt
#	CONTRIBUTING.md
#	Makefile
#	README.md
#	ci/run.sh
#	common/common.h
#	examples/main-cmake-pkg/CMakeLists.txt
#	ggml/src/CMakeLists.txt
#	models/ggml-vocab-bert-bge.gguf.inp
#	models/ggml-vocab-bert-bge.gguf.out
#	models/ggml-vocab-deepseek-coder.gguf.inp
#	models/ggml-vocab-deepseek-coder.gguf.out
#	models/ggml-vocab-deepseek-llm.gguf.inp
#	models/ggml-vocab-deepseek-llm.gguf.out
#	models/ggml-vocab-falcon.gguf.inp
#	models/ggml-vocab-falcon.gguf.out
#	models/ggml-vocab-gpt-2.gguf.inp
#	models/ggml-vocab-gpt-2.gguf.out
#	models/ggml-vocab-llama-bpe.gguf.inp
#	models/ggml-vocab-llama-bpe.gguf.out
#	models/ggml-vocab-llama-spm.gguf.inp
#	models/ggml-vocab-llama-spm.gguf.out
#	models/ggml-vocab-mpt.gguf.inp
#	models/ggml-vocab-mpt.gguf.out
#	models/ggml-vocab-phi-3.gguf.inp
#	models/ggml-vocab-phi-3.gguf.out
#	models/ggml-vocab-starcoder.gguf.inp
#	models/ggml-vocab-starcoder.gguf.out
#	requirements.txt
#	requirements/requirements-convert_legacy_llama.txt
#	scripts/check-requirements.sh
#	scripts/pod-llama.sh
#	src/CMakeLists.txt
#	src/llama.cpp
#	tests/test-rope.cpp
2 parents: 388a2af + 7ed03b8

File tree

85 files changed: 4150 additions & 996 deletions


README.md

Lines changed: 69 additions & 63 deletions
Large diffs are not rendered by default.

colab.ipynb

Lines changed: 1 addition & 1 deletion
@@ -48,7 +48,7 @@
     "source": [
     "#@title <b>v-- Enter your model below and then click this to start Koboldcpp</b>\r\n",
     "\r\n",
-    "Model = \"https://huggingface.co/KoboldAI/LLaMA2-13B-Tiefighter-GGUF/resolve/main/LLaMA2-13B-Tiefighter.Q4_K_S.gguf\" #@param [\"https://huggingface.co/KoboldAI/LLaMA2-13B-Tiefighter-GGUF/resolve/main/LLaMA2-13B-Tiefighter.Q4_K_S.gguf\",\"https://huggingface.co/KoboldAI/LLaMA2-13B-Estopia-GGUF/resolve/main/LLaMA2-13B-Estopia.Q4_K_S.gguf\",\"https://huggingface.co/Sao10K/Fimbulvetr-11B-v2-GGUF/resolve/main/Fimbulvetr-11B-v2-Test-14.q4_K_M.gguf\",\"https://huggingface.co/TheBloke/MythoMax-L2-13B-GGUF/resolve/main/mythomax-l2-13b.Q4_K_M.gguf\",\"https://huggingface.co/TheBloke/ReMM-SLERP-L2-13B-GGUF/resolve/main/remm-slerp-l2-13b.Q4_K_M.gguf\",\"https://huggingface.co/TheBloke/Xwin-LM-13B-v0.2-GGUF/resolve/main/xwin-lm-13b-v0.2.Q4_K_M.gguf\",\"https://huggingface.co/TheBloke/Stheno-L2-13B-GGUF/resolve/main/stheno-l2-13b.Q4_K_M.gguf\",\"https://huggingface.co/TheBloke/MythoMax-L2-Kimiko-v2-13B-GGUF/resolve/main/mythomax-l2-kimiko-v2-13b.Q4_K_M.gguf\",\"https://huggingface.co/TheBloke/airoboros-mistral2.2-7B-GGUF/resolve/main/airoboros-mistral2.2-7b.Q4_K_S.gguf\",\"https://huggingface.co/concedo/KobbleTinyV2-1.1B-GGUF/resolve/main/KobbleTiny-Q4_K.gguf\",\"https://huggingface.co/grimjim/kukulemon-7B-GGUF/resolve/main/kukulemon-7B.Q8_0.gguf\",\"https://huggingface.co/mradermacher/LemonKunoichiWizardV3-GGUF/resolve/main/LemonKunoichiWizardV3.Q4_K_M.gguf\",\"https://huggingface.co/Lewdiculous/Kunoichi-DPO-v2-7B-GGUF-Imatrix/resolve/main/Kunoichi-DPO-v2-7B-Q4_K_M-imatrix.gguf\",\"https://huggingface.co/mradermacher/L3-8B-Stheno-v3.2-i1-GGUF/resolve/main/L3-8B-Stheno-v3.2.i1-Q4_K_M.gguf\",\"https://huggingface.co/Lewdiculous/Llama-3-Lumimaid-8B-v0.1-OAS-GGUF-IQ-Imatrix/resolve/main/v2-Llama-3-Lumimaid-8B-v0.1-OAS-Q4_K_M-imat.gguf\",\"https://huggingface.co/bartowski/NeuralDaredevil-8B-abliterated-GGUF/resolve/main/NeuralDaredevil-8B-abliterated-Q4_K_M.gguf\",\"https://huggingface.co/bartowski/L3-8B-Lunaris-v1-GGUF/resolve/main/L3-8B-Lunaris-v1-Q4_K_M.gguf\",\"https://huggingface.co/mradermacher/L3-Umbral-Mind-RP-v2.0-8B-GGUF/resolve/main/L3-Umbral-Mind-RP-v2.0-8B.Q4_K_M.gguf\"]{allow-input: true}\r\n",
+    "Model = \"https://huggingface.co/KoboldAI/LLaMA2-13B-Tiefighter-GGUF/resolve/main/LLaMA2-13B-Tiefighter.Q4_K_S.gguf\" #@param [\"https://huggingface.co/KoboldAI/LLaMA2-13B-Tiefighter-GGUF/resolve/main/LLaMA2-13B-Tiefighter.Q4_K_S.gguf\",\"https://huggingface.co/KoboldAI/LLaMA2-13B-Estopia-GGUF/resolve/main/LLaMA2-13B-Estopia.Q4_K_S.gguf\",\"https://huggingface.co/Sao10K/Fimbulvetr-11B-v2-GGUF/resolve/main/Fimbulvetr-11B-v2-Test-14.q4_K_M.gguf\",\"https://huggingface.co/TheBloke/MythoMax-L2-13B-GGUF/resolve/main/mythomax-l2-13b.Q4_K_M.gguf\",\"https://huggingface.co/TheBloke/ReMM-SLERP-L2-13B-GGUF/resolve/main/remm-slerp-l2-13b.Q4_K_M.gguf\",\"https://huggingface.co/TheBloke/Xwin-LM-13B-v0.2-GGUF/resolve/main/xwin-lm-13b-v0.2.Q4_K_M.gguf\",\"https://huggingface.co/TheBloke/Stheno-L2-13B-GGUF/resolve/main/stheno-l2-13b.Q4_K_M.gguf\",\"https://huggingface.co/TheBloke/MythoMax-L2-Kimiko-v2-13B-GGUF/resolve/main/mythomax-l2-kimiko-v2-13b.Q4_K_M.gguf\",\"https://huggingface.co/TheBloke/MistRP-Airoboros-7B-GGUF/resolve/main/mistrp-airoboros-7b.Q4_K_S.gguf\",\"https://huggingface.co/TheBloke/airoboros-mistral2.2-7B-GGUF/resolve/main/airoboros-mistral2.2-7b.Q4_K_S.gguf\",\"https://huggingface.co/concedo/KobbleTinyV2-1.1B-GGUF/resolve/main/KobbleTiny-Q4_K.gguf\",\"https://huggingface.co/grimjim/kukulemon-7B-GGUF/resolve/main/kukulemon-7B.Q8_0.gguf\",\"https://huggingface.co/mradermacher/LemonKunoichiWizardV3-GGUF/resolve/main/LemonKunoichiWizardV3.Q4_K_M.gguf\",\"https://huggingface.co/Lewdiculous/Kunoichi-DPO-v2-7B-GGUF-Imatrix/resolve/main/Kunoichi-DPO-v2-7B-Q4_K_M-imatrix.gguf\",\"https://huggingface.co/mradermacher/L3-8B-Stheno-v3.2-i1-GGUF/resolve/main/L3-8B-Stheno-v3.2.i1-Q4_K_M.gguf\",\"https://huggingface.co/Lewdiculous/Llama-3-Lumimaid-8B-v0.1-OAS-GGUF-IQ-Imatrix/resolve/main/v2-Llama-3-Lumimaid-8B-v0.1-OAS-Q4_K_M-imat.gguf\",\"https://huggingface.co/bartowski/NeuralDaredevil-8B-abliterated-GGUF/resolve/main/NeuralDaredevil-8B-abliterated-Q4_K_M.gguf\",\"https://huggingface.co/bartowski/L3-8B-Lunaris-v1-GGUF/resolve/main/L3-8B-Lunaris-v1-Q4_K_M.gguf\",\"https://huggingface.co/mradermacher/L3-Umbral-Mind-RP-v2.0-8B-GGUF/resolve/main/L3-Umbral-Mind-RP-v2.0-8B.Q4_K_M.gguf\"]{allow-input: true}\r\n",
     "Layers = 99 #@param [99]{allow-input: true}\r\n",
     "ContextSize = 4096 #@param [4096] {allow-input: true}\r\n",
     "#@markdown <hr>\r\n",

common/common.cpp

Lines changed: 38 additions & 5 deletions
@@ -473,6 +473,14 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
         else { invalid_param = true; }
         return true;
     }
+    if (arg == "--attention") {
+        CHECK_ARG
+        std::string value(argv[i]);
+        /**/ if (value == "causal")     { params.attention_type = LLAMA_ATTENTION_TYPE_CAUSAL; }
+        else if (value == "non-causal") { params.attention_type = LLAMA_ATTENTION_TYPE_NON_CAUSAL; }
+        else { invalid_param = true; }
+        return true;
+    }
     if (arg == "--defrag-thold" || arg == "-dt") {
         CHECK_ARG
         params.defrag_thold = std::stof(argv[i]);
@@ -758,7 +766,7 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
         params.cache_type_v = argv[++i];
         return true;
     }
-    if (arg == "--multiline-input") {
+    if (arg == "-mli" || arg == "--multiline-input") {
         params.multiline_input = true;
         return true;
     }
@@ -1395,7 +1403,9 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
     options.push_back({ "*", " --keep N", "number of tokens to keep from the initial prompt (default: %d, -1 = all)", params.n_keep });
     options.push_back({ "*", " --chunks N", "max number of chunks to process (default: %d, -1 = all)", params.n_chunks });
     options.push_back({ "*", "-fa, --flash-attn", "enable Flash Attention (default: %s)", params.flash_attn ? "enabled" : "disabled" });
-    options.push_back({ "*", "-p, --prompt PROMPT", "prompt to start generation with (default: '%s')", params.prompt.c_str() });
+    options.push_back({ "*", "-p, --prompt PROMPT", "prompt to start generation with\n"
+        "in conversation mode, this will be used as system prompt\n"
+        "(default: '%s')", params.prompt.c_str() });
     options.push_back({ "*", "-f, --file FNAME", "a file containing the prompt (default: none)" });
     options.push_back({ "*", " --in-file FNAME", "an input file (repeat to specify multiple files)" });
     options.push_back({ "*", "-bf, --binary-file FNAME", "binary file containing the prompt (default: none)" });
@@ -1410,7 +1420,9 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
         "halt generation at PROMPT, return control in interactive mode\n"
         "can be specified more than once for multiple prompts" });
     options.push_back({ "main", "-sp, --special", "special tokens output enabled (default: %s)", params.special ? "true" : "false" });
-    options.push_back({ "main", "-cnv, --conversation", "run in conversation mode (does not print special tokens and suffix/prefix, use default chat template) (default: %s)", params.conversation ? "true" : "false" });
+    options.push_back({ "main", "-cnv, --conversation", "run in conversation mode, does not print special tokens and suffix/prefix\n"
+        "if suffix/prefix are not specified, default chat template will be used\n"
+        "(default: %s)", params.conversation ? "true" : "false" });
     options.push_back({ "main infill", "-i, --interactive", "run in interactive mode (default: %s)", params.interactive ? "true" : "false" });
     options.push_back({ "main infill", "-if, --interactive-first", "run in interactive mode and wait for input right away (default: %s)", params.interactive_first ? "true" : "false" });
     options.push_back({ "main infill", "-mli, --multiline-input", "allows you to write or paste multiple lines without ending each in '\\'" });
@@ -1454,6 +1466,7 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
     options.push_back({ "main", " --cfg-scale N", "strength of guidance (default: %.1f, 1.0 = disable)", (double)sparams.cfg_scale });
     options.push_back({ "main", " --chat-template JINJA_TEMPLATE",
         "set custom jinja chat template (default: template taken from model's metadata)\n"
+        "if suffix/prefix are specified, template will be disabled\n"
        "only commonly used templates are accepted:\n"
        "https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template" });
     options.push_back({ "grammar" });
@@ -1464,8 +1477,10 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
         "For schemas w/ external $refs, use --grammar + example/json_schema_to_grammar.py instead" });

     options.push_back({ "embedding" });
-    options.push_back({ "embedding", " --pooling {none,mean,cls}",
+    options.push_back({ "embedding", " --pooling {none,mean,cls,last}",
         "pooling type for embeddings, use model default if unspecified" });
+    options.push_back({ "embedding", " --attention {causal,non-causal}",
+        "attention type for embeddings, use model default if unspecified" });

     options.push_back({ "context hacking" });
     options.push_back({ "*", " --rope-scaling {none,linear,yarn}",
@@ -2071,7 +2086,24 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
     if (params.warmup) {
         LOG("warming up the model with an empty run\n");

-        std::vector<llama_token> tmp = { llama_token_bos(model), llama_token_eos(model), };
+        std::vector<llama_token> tmp;
+        llama_token bos = llama_token_bos(model);
+        llama_token eos = llama_token_eos(model);
+        // some models (e.g. T5) don't have a BOS token
+        if (bos != -1) {
+            tmp.push_back(bos);
+        }
+        tmp.push_back(eos);
+
+        if (llama_model_has_encoder(model)) {
+            llama_encode(lctx, llama_batch_get_one(tmp.data(), tmp.size(), 0, 0));
+            llama_token decoder_start_token_id = llama_model_decoder_start_token(model);
+            if (decoder_start_token_id == -1) {
+                decoder_start_token_id = bos;
+            }
+            tmp.clear();
+            tmp.push_back(decoder_start_token_id);
+        }
         llama_decode(lctx, llama_batch_get_one(tmp.data(), std::min(tmp.size(), (size_t) params.n_batch), 0, 0));
         llama_kv_cache_clear(lctx);
         llama_synchronize(lctx);
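
The reworked warmup mirrors how encoder-decoder models such as T5 are actually driven: the prompt tokens go through llama_encode first, and decoding then starts from the model's decoder start token rather than from the prompt itself. Below is a minimal sketch of that flow using only the API calls visible in the hunk above; the function name is illustrative (not part of llama.cpp), `model` and `lctx` are assumed to have been created beforehand (e.g. via llama_init_from_gpt_params), and error handling is omitted.

// Sketch only: drive a (possibly encoder-decoder) model with the API used in the warmup above.
#include <algorithm>
#include <vector>
#include "llama.h"

static void run_warmup_tokens(llama_model * model, llama_context * lctx,
                              std::vector<llama_token> tmp, int n_batch) {
    if (llama_model_has_encoder(model)) {
        // encoder-decoder path: the prompt tokens are fed to the encoder first
        llama_encode(lctx, llama_batch_get_one(tmp.data(), tmp.size(), 0, 0));

        // decoding starts from the decoder start token; fall back to BOS if the
        // model does not define one, exactly as the warmup code above does
        llama_token start = llama_model_decoder_start_token(model);
        if (start == -1) {
            start = llama_token_bos(model);
        }
        tmp.clear();
        tmp.push_back(start);
    }
    // decoder-only models (and the decoder half of encoder-decoder models) go through llama_decode
    llama_decode(lctx, llama_batch_get_one(tmp.data(), std::min(tmp.size(), (size_t) n_batch), 0, 0));
}

Clearing the KV cache and synchronizing afterwards are left to the caller, as in the diff above.
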
@@ -2154,6 +2186,7 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param
     cparams.yarn_beta_slow = params.yarn_beta_slow;
     cparams.yarn_orig_ctx = params.yarn_orig_ctx;
     cparams.pooling_type = params.pooling_type;
+    cparams.attention_type = params.attention_type;
     cparams.defrag_thold = params.defrag_thold;
     cparams.cb_eval = params.cb_eval;
     cparams.cb_eval_user_data = params.cb_eval_user_data;
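
With this last hunk, the attention type chosen on the command line is plumbed from gpt_params into llama_context_params, alongside the pooling type. Code that builds a context directly, without going through gpt_params, can set the same fields; the following is a minimal sketch for a BERT-style embedding model, assuming the llama.h API of this revision (the field and enum names come from the hunks above, and "bge-small.gguf" is a placeholder path).

// Sketch only: create an embeddings context with explicit pooling and attention types.
#include "llama.h"

int main() {
    llama_model_params mparams = llama_model_default_params();
    llama_model * model = llama_load_model_from_file("bge-small.gguf", mparams);
    if (model == nullptr) {
        return 1;
    }

    llama_context_params cparams = llama_context_default_params();
    cparams.embeddings     = true;                             // produce embeddings instead of logits
    cparams.pooling_type   = LLAMA_POOLING_TYPE_MEAN;          // mean-pool the token embeddings
    cparams.attention_type = LLAMA_ATTENTION_TYPE_NON_CAUSAL;  // bidirectional attention for encoder-style models

    llama_context * ctx = llama_new_context_with_model(model, cparams);
    // ... tokenize, evaluate, read the embeddings, then clean up ...
    llama_free(ctx);
    llama_free_model(model);
    return 0;
}

Leaving attention_type at LLAMA_ATTENTION_TYPE_UNSPECIFIED keeps the model's default, matching the behaviour documented by the new --attention option.
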

common/common.h

Lines changed: 1 addition & 2 deletions
@@ -95,6 +95,7 @@ struct gpt_params {
     enum llama_split_mode split_mode = LLAMA_SPLIT_MODE_LAYER; // how to split the model across GPUs
     enum llama_rope_scaling_type rope_scaling_type = LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED;
     enum llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_UNSPECIFIED; // pooling type for embeddings
+    enum llama_attention_type attention_type = LLAMA_ATTENTION_TYPE_UNSPECIFIED; // attention type for embeddings

     // sampling parameters
     int32_t top_k = 40; // <= 0 to use vocab size

@@ -475,5 +476,3 @@ void yaml_dump_string_multiline(FILE * stream, const char * prop_name, const cha
 void yaml_dump_non_result_info(
     FILE * stream, const gpt_params & params, const llama_context * lctx,
     const std::string & timestamp, const std::vector<int> & prompt_tokens, const char * model_desc);
-
-
