
Commit 0fc18d2

Merge branch 'upstream' into concedo_experimental
# Conflicts:
#	.devops/nix/package.nix
#	CMakePresets.json
#	README.md
#	flake.lock
#	ggml/src/CMakeLists.txt
#	tests/test-backend-ops.cpp
#	tests/test-chat-template.cpp
2 parents 82202ae + 023b880 commit 0fc18d2

25 files changed: +1455 -1370 lines

common/common.cpp

Lines changed: 13 additions & 3 deletions
@@ -1015,16 +1015,19 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
     }
     if (arg == "--in-prefix-bos") {
         params.input_prefix_bos = true;
+        params.enable_chat_template = false;
         return true;
     }
     if (arg == "--in-prefix") {
        CHECK_ARG
        params.input_prefix = argv[i];
+        params.enable_chat_template = false;
        return true;
    }
    if (arg == "--in-suffix") {
        CHECK_ARG
        params.input_suffix = argv[i];
+        params.enable_chat_template = false;
        return true;
    }
    if (arg == "--spm-infill") {
@@ -1407,7 +1410,7 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
        "halt generation at PROMPT, return control in interactive mode\n"
        "can be specified more than once for multiple prompts" });
    options.push_back({ "main", "-sp, --special", "special tokens output enabled (default: %s)", params.special ? "true" : "false" });
-    options.push_back({ "main", "-cnv, --conversation", "run in conversation mode (does not print special tokens and suffix/prefix) (default: %s)", params.conversation ? "true" : "false" });
+    options.push_back({ "main", "-cnv, --conversation", "run in conversation mode (does not print special tokens and suffix/prefix, use default chat template) (default: %s)", params.conversation ? "true" : "false" });
    options.push_back({ "main infill", "-i, --interactive", "run in interactive mode (default: %s)", params.interactive ? "true" : "false" });
    options.push_back({ "main infill", "-if, --interactive-first", "run in interactive mode and wait for input right away (default: %s)", params.interactive_first ? "true" : "false" });
    options.push_back({ "main infill", "-mli, --multiline-input", "allows you to write or paste multiple lines without ending each in '\\'" });
@@ -2669,12 +2672,19 @@ std::string llama_chat_format_single(const struct llama_model * model,
        const std::vector<llama_chat_msg> & past_msg,
        const llama_chat_msg & new_msg,
        bool add_ass) {
+    std::ostringstream ss;
    auto fmt_past_msg = llama_chat_apply_template(model, tmpl, past_msg, false);
    std::vector<llama_chat_msg> chat_new(past_msg);
+    // if the past_msg ends with a newline, we must preserve it in the formatted version
+    if (add_ass && !fmt_past_msg.empty() && fmt_past_msg.back() == '\n') {
+        ss << "\n";
+    };
+    // format chat with new_msg
    chat_new.push_back(new_msg);
    auto fmt_new_msg = llama_chat_apply_template(model, tmpl, chat_new, add_ass);
-    auto formatted = fmt_new_msg.substr(fmt_past_msg.size(), fmt_new_msg.size() - fmt_past_msg.size());
-    return formatted;
+    // get the diff part
+    ss << fmt_new_msg.substr(fmt_past_msg.size(), fmt_new_msg.size() - fmt_past_msg.size());
+    return ss.str();
 }

 std::string llama_chat_format_example(const struct llama_model * model,
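
The llama_chat_format_single change above now builds its result in a std::ostringstream so that a trailing newline of the formatted history can be preserved in front of the newly appended fragment. A minimal standalone sketch of the same diff-based idea, using a hypothetical apply_template stand-in rather than the real llama_chat_apply_template:

#include <iostream>
#include <sstream>
#include <string>
#include <vector>

// Hypothetical stand-in for llama_chat_apply_template: formats the whole chat.
static std::string apply_template(const std::vector<std::string> & msgs, bool add_ass) {
    std::ostringstream out;
    for (const auto & m : msgs) {
        out << "<|msg|>" << m << "\n";
    }
    if (add_ass) {
        out << "<|assistant|>";
    }
    return out.str();
}

// Format the history, then history + new message, and return only the appended
// suffix; if the formatted history ends with a newline, re-emit it so it is
// preserved in the returned fragment (mirrors the logic in the diff above).
static std::string format_single(std::vector<std::string> past, const std::string & new_msg) {
    std::ostringstream ss;
    const std::string fmt_past = apply_template(past, /*add_ass=*/false);
    if (!fmt_past.empty() && fmt_past.back() == '\n') {
        ss << "\n";
    }
    past.push_back(new_msg);
    const std::string fmt_new = apply_template(past, /*add_ass=*/true);
    ss << fmt_new.substr(fmt_past.size());
    return ss.str();
}

int main() {
    // Prints only the fragment for the new user message, not the whole chat.
    std::cout << format_single({"hello"}, "how are you?") << "\n";
    return 0;
}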

common/common.h

Lines changed: 1 addition & 0 deletions
@@ -217,6 +217,7 @@ struct gpt_params {
     std::string public_path = "";
     std::string chat_template = "";
     std::string system_prompt = "";
+    bool enable_chat_template = true;

     std::vector<std::string> api_keys;

convert-hf-to-gguf.py

Lines changed: 38 additions & 13 deletions
@@ -576,7 +576,19 @@ def _set_vocab_qwen(self):
         special_vocab._set_special_token("unk", tokenizer.special_tokens["<|endoftext|>"])
         special_vocab.add_to_gguf(self.gguf_writer)

-    def _set_vocab_sentencepiece(self):
+    def _set_vocab_sentencepiece(self, add_to_gguf=True):
+        tokens, scores, toktypes = self._create_vocab_sentencepiece()
+
+        self.gguf_writer.add_tokenizer_model("llama")
+        self.gguf_writer.add_tokenizer_pre("default")
+        self.gguf_writer.add_token_list(tokens)
+        self.gguf_writer.add_token_scores(scores)
+        self.gguf_writer.add_token_types(toktypes)
+
+        special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
+        special_vocab.add_to_gguf(self.gguf_writer)
+
+    def _create_vocab_sentencepiece(self):
         from sentencepiece import SentencePieceProcessor

         tokenizer_path = self.dir_model / 'tokenizer.model'
@@ -638,14 +650,7 @@ def _set_vocab_sentencepiece(self):
             scores.append(-1000.0)
             toktypes.append(SentencePieceTokenTypes.UNUSED)

-        self.gguf_writer.add_tokenizer_model("llama")
-        self.gguf_writer.add_tokenizer_pre("default")
-        self.gguf_writer.add_token_list(tokens)
-        self.gguf_writer.add_token_scores(scores)
-        self.gguf_writer.add_token_types(toktypes)
-
-        special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
-        special_vocab.add_to_gguf(self.gguf_writer)
+        return tokens, scores, toktypes

     def _set_vocab_llama_hf(self):
         vocab = gguf.LlamaHfVocab(self.dir_model)
@@ -2345,7 +2350,19 @@ class Gemma2Model(Model):
     model_arch = gguf.MODEL_ARCH.GEMMA2

     def set_vocab(self):
-        self._set_vocab_llama_hf()
+        tokens, scores, toktypes = self._create_vocab_sentencepiece()
+        # hack: This is required so that we can properly use start/end-of-turn for chat template
+        for i in range(108):
+            # including <unusedX>, <start_of_turn>, <end_of_turn>
+            toktypes[i] = SentencePieceTokenTypes.CONTROL
+        self.gguf_writer.add_tokenizer_model("llama")
+        self.gguf_writer.add_tokenizer_pre("default")
+        self.gguf_writer.add_token_list(tokens)
+        self.gguf_writer.add_token_scores(scores)
+        self.gguf_writer.add_token_types(toktypes)
+
+        special_vocab = gguf.SpecialVocab(self.dir_model, n_vocab=len(tokens))
+        special_vocab.add_to_gguf(self.gguf_writer)
         self.gguf_writer.add_add_space_prefix(False)

     def set_gguf_parameters(self):
@@ -2369,6 +2386,12 @@ def set_gguf_parameters(self):
         self.gguf_writer.add_final_logit_softcapping(
             self.hparams["final_logit_softcapping"]
         )
+        self.gguf_writer.add_sliding_window(self.hparams["sliding_window"])
+
+        # sanity check
+        attn_scalar = self.hparams["query_pre_attn_scalar"]
+        if attn_scalar != hparams["hidden_size"] / hparams["num_attention_heads"]:
+            raise ValueError("query_pre_attn_scalar must be equal to n_embd / n_head")

     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
         del bid  # unusem
@@ -3097,7 +3120,8 @@ def main() -> None:
         "auto": gguf.LlamaFileType.GUESSED,
     }

-    if args.use_temp_file and (args.split_max_tensors > 0 or args.split_max_size != "0"):
+    is_split = args.split_max_tensors > 0 or args.split_max_size != "0"
+    if args.use_temp_file and is_split:
         logger.error("Error: Cannot use temp file when splitting")
         sys.exit(1)

@@ -3134,11 +3158,12 @@ def main() -> None:
     if args.vocab_only:
         logger.info("Exporting model vocab...")
         model_instance.write_vocab()
-        logger.info("Model vocab successfully exported.")
+        logger.info(f"Model vocab successfully exported to {model_instance.fname_out}")
     else:
         logger.info("Exporting model...")
         model_instance.write()
-        logger.info("Model successfully exported.")
+        out_path = f"{model_instance.fname_out.parent}{os.sep}" if is_split else model_instance.fname_out
+        logger.info(f"Model successfully exported to {out_path}")


 if __name__ == '__main__':
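
The added sanity check ties query_pre_attn_scalar to the head dimension, hidden_size / num_attention_heads. A small hedged sketch of that relationship, using made-up example values that are not taken from any real Gemma 2 config:

#include <stdexcept>

// Hypothetical example values, chosen only to illustrate the check:
// head_dim = hidden_size / num_attention_heads = 4096 / 32 = 128.
constexpr double hidden_size           = 4096;
constexpr double num_attention_heads   = 32;
constexpr double query_pre_attn_scalar = 128;

// Same condition as the converter's sanity check in the diff above.
static void check_query_pre_attn_scalar() {
    if (query_pre_attn_scalar != hidden_size / num_attention_heads) {
        throw std::runtime_error("query_pre_attn_scalar must be equal to n_embd / n_head");
    }
}

int main() {
    check_query_pre_attn_scalar(); // passes here: 128 == 4096 / 32
    return 0;
}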

examples/main/main.cpp

Lines changed: 7 additions & 4 deletions
@@ -262,7 +262,7 @@ int main(int argc, char ** argv) {
     std::vector<llama_token> embd_inp;

     {
-        auto prompt = params.conversation
+        auto prompt = (params.conversation && params.enable_chat_template)
            ? chat_add_and_format(model, chat_msgs, "system", params.prompt) // format the system prompt in conversation mode
            : params.prompt;
        if (params.interactive_first || !params.prompt.empty() || session_tokens.empty()) {
@@ -811,7 +811,9 @@ int main(int argc, char ** argv) {
                is_antiprompt = true;
            }

-            chat_add_and_format(model, chat_msgs, "assistant", assistant_ss.str());
+            if (params.enable_chat_template) {
+                chat_add_and_format(model, chat_msgs, "assistant", assistant_ss.str());
+            }
            is_interacting = true;
            printf("\n");
        }
@@ -873,12 +875,13 @@ int main(int argc, char ** argv) {
                    string_process_escapes(buffer);
                }

-                std::string user_inp = params.conversation
+                bool format_chat = params.conversation && params.enable_chat_template;
+                std::string user_inp = format_chat
                    ? chat_add_and_format(model, chat_msgs, "user", std::move(buffer))
                    : std::move(buffer);
                // TODO: one inconvenient of current chat template implementation is that we can't distinguish between user input and special tokens (prefix/postfix)
                const auto line_pfx = ::llama_tokenize(ctx, params.input_prefix, false, true);
-                const auto line_inp = ::llama_tokenize(ctx, user_inp, false, params.conversation);
+                const auto line_inp = ::llama_tokenize(ctx, user_inp, false, format_chat);
                const auto line_sfx = ::llama_tokenize(ctx, params.input_suffix, false, true);

                LOG("input tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, line_inp).c_str());
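
The main.cpp changes gate all chat-template formatting on a single condition: conversation mode with the template still enabled. A compressed sketch of that gate, with chat_add_and_format reduced to a placeholder helper:

#include <iostream>
#include <string>
#include <utility>

// Placeholder for the real chat_add_and_format helper used in main.cpp.
static std::string chat_add_and_format_stub(const std::string & role, const std::string & content) {
    return "<|" + role + "|>" + content;
}

// Mirrors the diff above: user input is only wrapped in the chat template when
// conversation mode is on AND the template is still enabled; options such as
// --in-prefix, --in-prefix-bos and --in-suffix now set enable_chat_template to false.
static std::string prepare_user_input(bool conversation, bool enable_chat_template, std::string buffer) {
    const bool format_chat = conversation && enable_chat_template;
    return format_chat ? chat_add_and_format_stub("user", buffer) : std::move(buffer);
}

int main() {
    std::cout << prepare_user_input(true, true,  "hi") << "\n"; // "<|user|>hi"
    std::cout << prepare_user_input(true, false, "hi") << "\n"; // "hi" (template disabled)
    return 0;
}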

ggml/src/ggml-common.h

Lines changed: 7 additions & 7 deletions
@@ -106,19 +106,19 @@ typedef sycl::half2 ggml_half2;
 #define QR6_K 2

 #define QI2_XXS (QK_K / (4*QR2_XXS))
-#define QR2_XXS 8
+#define QR2_XXS 4

 #define QI2_XS (QK_K / (4*QR2_XS))
-#define QR2_XS 8
+#define QR2_XS 4

 #define QI2_S (QK_K / (4*QR2_S))
-#define QR2_S 8
+#define QR2_S 4

 #define QI3_XXS (QK_K / (4*QR3_XXS))
-#define QR3_XXS 8
+#define QR3_XXS 4

 #define QI3_XS (QK_K / (4*QR3_XS))
-#define QR3_XS 8
+#define QR3_XS 4

 #define QI1_S (QK_K / (4*QR1_S))
 #define QR1_S 8
@@ -130,10 +130,10 @@ typedef sycl::half2 ggml_half2;
 #define QR4_NL 2

 #define QI4_XS (QK_K / (4*QR4_XS))
-#define QR4_XS 8
+#define QR4_XS 2

 #define QI3_S (QK_K / (4*QR3_S))
-#define QR3_S 8
+#define QR3_S 4

 #endif // GGML_COMMON_DECL_CUDA || GGML_COMMON_DECL_HIP

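Each QR value above feeds the matching QI definition directly, QI = QK_K / (4*QR), so halving a QR constant doubles the corresponding QI. A small worked sketch, assuming QK_K is 256 as defined elsewhere in ggml:

#include <cstdio>

// Assumed value of QK_K (256 elsewhere in ggml-common.h); everything below
// just replays the arithmetic of the macros in the diff above.
constexpr int QK_K    = 256;
constexpr int QR2_XXS = 4;                      // was 8 before this change
constexpr int QI2_XXS = QK_K / (4 * QR2_XXS);   // 256 / 16 = 16 (previously 256 / 32 = 8)

static_assert(QI2_XXS == 16, "halving QR2_XXS doubles QI2_XXS");

int main() {
    std::printf("QR2_XXS = %d -> QI2_XXS = %d\n", QR2_XXS, QI2_XXS);
    return 0;
}
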
ggml/src/ggml-cuda.cu

Lines changed: 35 additions & 24 deletions
@@ -1882,6 +1882,11 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor
     bool use_mul_mat_q = ggml_is_quantized(src0->type)
        && src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32;

+    // if mmvq is available it's a better choice than dmmv:
+#ifndef GGML_CUDA_FORCE_DMMV
+    use_dequantize_mul_mat_vec = use_dequantize_mul_mat_vec && !use_mul_mat_vec_q;
+#endif // GGML_CUDA_FORCE_DMMV
+
     bool any_gpus_with_slow_fp16 = false;

     if (split) {
@@ -1894,22 +1899,15 @@ static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor
            }

            const int cc = ggml_cuda_info().devices[id].cc;
-            use_mul_mat_vec_q = use_mul_mat_vec_q && cc >= MIN_CC_DP4A;
            use_mul_mat_q = use_mul_mat_q && ggml_cuda_should_use_mmq(src0->type, cc, src1->ne[1]);
            any_gpus_with_slow_fp16 = any_gpus_with_slow_fp16 || !fast_fp16_available(cc);
        }
    } else {
        const int cc = ggml_cuda_info().devices[ctx.device].cc;
-        use_mul_mat_vec_q = use_mul_mat_vec_q && cc >= MIN_CC_DP4A;
        use_mul_mat_q = use_mul_mat_q && ggml_cuda_should_use_mmq(src0->type, cc, src1->ne[1]);
        any_gpus_with_slow_fp16 = any_gpus_with_slow_fp16 || !fast_fp16_available(cc);
    }

-    // if mmvq is available it's a better choice than dmmv:
-#ifndef GGML_CUDA_FORCE_DMMV
-    use_dequantize_mul_mat_vec = use_dequantize_mul_mat_vec && !use_mul_mat_vec_q;
-#endif // GGML_CUDA_FORCE_DMMV
-
    // debug helpers
    //printf("src0: %8d %8d %8d %8d\n", src0->ne[0], src0->ne[1], src0->ne[2], src0->ne[3]);
    //printf("      %8d %8d %8d %8d\n", src0->nb[0], src0->nb[1], src0->nb[2], src0->nb[3]);
@@ -2717,27 +2715,40 @@ GGML_CALL static bool ggml_backend_cuda_supports_op(ggml_backend_t backend, cons
        case GGML_OP_MUL_MAT:
        case GGML_OP_MUL_MAT_ID:
            {
-                struct ggml_tensor * a;
-                struct ggml_tensor * b;
+                struct ggml_tensor * a = op->src[0];
                if (op->op == GGML_OP_MUL_MAT) {
-                    a = op->src[0];
-                    b = op->src[1];
-                } else {
-                    a = op->src[2];
-                    b = op->src[1];
-                }
-                if (a->ne[3] != b->ne[3]) {
-                    return false;
-                }
-                ggml_type a_type = a->type;
-                if (a_type == GGML_TYPE_IQ2_XXS || a_type == GGML_TYPE_IQ2_XS || a_type == GGML_TYPE_IQ3_XXS ||
-                    a_type == GGML_TYPE_IQ1_S   || a_type == GGML_TYPE_IQ4_NL || a_type == GGML_TYPE_IQ3_S   ||
-                    a_type == GGML_TYPE_IQ1_M   || a_type == GGML_TYPE_IQ2_S  || a_type == GGML_TYPE_IQ4_XS) {
-                    if (b->ne[1] == 1 && ggml_nrows(b) > 1) {
+                    struct ggml_tensor * b = op->src[1];
+                    if (a->ne[3] != b->ne[3]) {
                        return false;
                    }
                }
-                return true;
+                switch (a->type) {
+                    case GGML_TYPE_F32:
+                    case GGML_TYPE_F16:
+                    case GGML_TYPE_Q4_0:
+                    case GGML_TYPE_Q4_1:
+                    case GGML_TYPE_Q5_0:
+                    case GGML_TYPE_Q5_1:
+                    case GGML_TYPE_Q8_0:
+                    case GGML_TYPE_Q2_K:
+                    case GGML_TYPE_Q3_K:
+                    case GGML_TYPE_Q4_K:
+                    case GGML_TYPE_Q5_K:
+                    case GGML_TYPE_Q6_K:
+                    case GGML_TYPE_Q8_K:
+                    case GGML_TYPE_IQ1_M:
+                    case GGML_TYPE_IQ1_S:
+                    case GGML_TYPE_IQ2_S:
+                    case GGML_TYPE_IQ2_XS:
+                    case GGML_TYPE_IQ2_XXS:
+                    case GGML_TYPE_IQ3_S:
+                    case GGML_TYPE_IQ3_XXS:
+                    case GGML_TYPE_IQ4_NL:
+                    case GGML_TYPE_IQ4_XS:
+                        return true;
+                    default:
+                        return false;
+                }
            } break;
        case GGML_OP_GET_ROWS:
            {
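
In ggml_cuda_mul_mat, the "prefer mmvq over dmmv" decision is hoisted above the per-device loop and the per-device DP4A gate on mmvq is dropped. A compressed sketch of that selection order, with the eligibility conditions reduced to plain booleans:

#include <cstdio>

// Compressed sketch of the kernel selection after the diff above: when mmvq is
// eligible it always wins over dmmv (unless GGML_CUDA_FORCE_DMMV is defined),
// and that decision is now made once, before any per-device checks.
struct mul_mat_choice {
    bool use_dmmv;
    bool use_mmvq;
};

static mul_mat_choice choose_vec_kernel(bool dmmv_eligible, bool mmvq_eligible) {
#ifndef GGML_CUDA_FORCE_DMMV
    // if mmvq is available it's a better choice than dmmv
    dmmv_eligible = dmmv_eligible && !mmvq_eligible;
#endif
    return { dmmv_eligible, mmvq_eligible };
}

int main() {
    const mul_mat_choice c = choose_vec_kernel(/*dmmv_eligible=*/true, /*mmvq_eligible=*/true);
    std::printf("use_dmmv=%d use_mmvq=%d\n", c.use_dmmv, c.use_mmvq); // use_dmmv=0 use_mmvq=1
    return 0;
}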
