
Commit baa882a

Merge branch 'master' into xsn/qwen25omni

2 parents: 94d893d + 4f81b33


49 files changed: +1329, -529 lines

common/arg.cpp

Lines changed: 127 additions & 98 deletions
Large diffs are not rendered by default.

common/chat-parser.cpp

Lines changed: 7 additions & 4 deletions
@@ -170,20 +170,23 @@ std::string common_chat_msg_parser::consume_rest() {
 }
 
 // Tries to find the regex, consumes it (pos right after it) and gives the prelude (right before it) and the groups to the callback.
-std::optional<common_chat_msg_parser::find_regex_result> common_chat_msg_parser::try_find_regex(const common_regex & regex, size_t from) {
+std::optional<common_chat_msg_parser::find_regex_result> common_chat_msg_parser::try_find_regex(const common_regex & regex, size_t from, bool add_prelude_to_content) {
     auto m = regex.search(input_, from == std::string::npos ? pos_ : from);
     if (m.type == COMMON_REGEX_MATCH_TYPE_NONE) {
         return std::nullopt;
     }
+    auto prelude = input_.substr(pos_, m.groups[0].begin - pos_);
+    pos_ = m.groups[0].end;
+
+    if (add_prelude_to_content) {
+        add_content(prelude);
+    }
     if (m.type == COMMON_REGEX_MATCH_TYPE_PARTIAL) {
         if (is_partial()) {
             throw common_chat_msg_partial_exception(regex.str());
         }
         return std::nullopt;
     }
-    auto prelude = input_.substr(pos_, m.groups[0].begin - pos_);
-    pos_ = m.groups[0].end;
-
     return find_regex_result{prelude, m.groups};
 }
 

common/chat-parser.h

Lines changed: 2 additions & 1 deletion
@@ -30,6 +30,7 @@ class common_chat_msg_parser {
     const std::string & healing_marker() const { return healing_marker_; }
     const bool & is_partial() const { return is_partial_; }
     const common_chat_msg & result() const { return result_; }
+    const common_chat_syntax & syntax() const { return syntax_; }
 
     void move_to(size_t pos) {
         if (pos > input_.size()) {
@@ -77,7 +78,7 @@ class common_chat_msg_parser {
         std::vector<common_string_range> groups;
     };
 
-    std::optional<find_regex_result> try_find_regex(const common_regex & regex, size_t from = std::string::npos);
+    std::optional<find_regex_result> try_find_regex(const common_regex & regex, size_t from = std::string::npos, bool add_prelude_to_content = true);
 
     bool try_consume_literal(const std::string & literal);
 
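The new `add_prelude_to_content` flag (default `true`, preserving the old behaviour) lets a format-specific parser take ownership of the text preceding a match instead of having it appended to the message content automatically. A minimal sketch of the call pattern, with a hypothetical tag regex and handler that are not part of this commit:

```cpp
#include "chat-parser.h"  // common_chat_msg_parser, common_regex

// Hypothetical handler: locate an opening tag but keep the prelude out of the
// content so it can be inspected first. The regex below is illustrative only.
static const common_regex open_tag_regex("<tool_call>");

static void parse_example(common_chat_msg_parser & builder) {
    // add_prelude_to_content = false: the prelude is still returned in the
    // result, but no longer added to the message content automatically.
    if (auto res = builder.try_find_regex(open_tag_regex, std::string::npos, /* add_prelude_to_content = */ false)) {
        builder.add_content(res->prelude);   // caller decides what to do with it
        // ... parse whatever follows the tag ...
    } else {
        builder.add_content(builder.consume_rest());
    }
}
```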

common/chat.cpp

Lines changed: 181 additions & 131 deletions
Large diffs are not rendered by default.

common/chat.h

Lines changed: 4 additions & 1 deletion
@@ -123,6 +123,7 @@ struct common_chat_templates_inputs {
     common_chat_tool_choice tool_choice = COMMON_CHAT_TOOL_CHOICE_AUTO;
     bool parallel_tool_calls = false;
     common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_NONE;
+    bool enable_thinking = true;
     std::chrono::system_clock::time_point now = std::chrono::system_clock::now();
 };
 
@@ -143,6 +144,7 @@ struct common_chat_syntax {
     // Whether reasoning_content should be inlined in the content (e.g. for reasoning_format=deepseek in stream mode)
     bool reasoning_in_content = false;
     bool thinking_forced_open = false;
+    bool parse_tool_calls = true;
 };
 
 // Check if the template supplied via "--chat-template" is supported or not. Returns true if it's valid
@@ -181,7 +183,8 @@ std::string common_chat_format_example(
     const struct common_chat_templates * tmpls,
     bool use_jinja);
 
-std::string common_chat_format_name(common_chat_format format);
+const char* common_chat_format_name(common_chat_format format);
+const char* common_reasoning_format_name(common_reasoning_format format);
 common_chat_msg common_chat_parse(const std::string & input, bool is_partial, const common_chat_syntax & syntax);
 
 common_chat_tool_choice common_chat_tool_choice_parse_oaicompat(const std::string & tool_choice);
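A minimal sketch of how the new `parse_tool_calls` field comes into play when parsing a model response; the input string and format value below are illustrative, not taken from this commit:

```cpp
#include "chat.h"  // common_chat_syntax, common_chat_parse

// Sketch only: parse a response with tool-call extraction disabled.
// Real callers set syntax.format from the detected chat template.
static common_chat_msg parse_plain(const std::string & raw) {
    common_chat_syntax syntax;
    syntax.format           = COMMON_CHAT_FORMAT_CONTENT_ONLY;
    syntax.parse_tool_calls = false; // new field: skip tool-call extraction entirely

    // With parse_tool_calls == false the whole input ends up in the content.
    return common_chat_parse(raw, /* is_partial = */ false, syntax);
}
```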

common/common.h

Lines changed: 2 additions & 0 deletions
@@ -291,6 +291,7 @@ struct common_params {
     int32_t verbosity = 0;
     int32_t control_vector_layer_start = -1; // layer range for control vector
     int32_t control_vector_layer_end = -1;   // layer range for control vector
+    bool offline = false;
 
     int32_t ppl_stride = 0;      // stride for perplexity calculations. If left at 0, the pre-existing approach will be used.
     int32_t ppl_output_type = 0; // = 0 -> ppl output is as usual, = 1 -> ppl output is num_tokens, ppl, one per line
@@ -368,6 +369,7 @@ struct common_params {
     bool use_jinja = false; // NOLINT
     bool enable_chat_template = true;
     common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK;
+    int reasoning_budget = -1;
     bool prefill_assistant = true; // if true, any trailing assistant message will be prefilled into the response
 
     std::vector<std::string> api_keys;
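The two new fields sit next to the existing chat and reasoning settings. A minimal sketch of setting them programmatically; their wiring to command-line flags lives in the common/arg.cpp diff above (not rendered), so the comments below are assumptions rather than documented behaviour:

```cpp
#include "common.h"

// Sketch only: the comments are assumptions about intent, not documented behaviour.
static common_params make_offline_params() {
    common_params params;
    params.offline          = true; // assumption: avoid network access (e.g. model downloads)
    params.reasoning_budget = 0;    // assumption: 0 disables thinking; the default -1 means no limit
    return params;
}
```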

docs/backend/CANN.md

File mode changed: 100644 → 100755
Lines changed: 9 additions & 0 deletions
@@ -280,6 +280,15 @@ cmake --build build --config release
 ### **GitHub contribution**:
 Please add the **[CANN]** prefix/tag in issues/PRs titles to help the CANN-team check/address them without delay.
 
+## Updates
+### Basic Flash Attention Support
+The basic FA kernel with aclnnops has been added in aclnn_ops.cpp.
+Currently, the FA only supports the cases with FP16 KV tensors and NO logit softcap.
+Since the aclnn interface for flash attention cannot support the logit softcap, we will only update the quantized version in the future.
+
+Authors from Peking University: Bizhao Shi ([email protected]), Yuxin Yang ([email protected]), Ruiyang Ma ([email protected]), and Guojie Luo ([email protected]).
+
+We would like to thank Tuo Dai, Shanni Li, and all of the project maintainers from Huawei Technologies Co., Ltd for their help during the code development and pull request.
 
 ## TODO
 - Support more models and data types.
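For context, a minimal sketch (not part of this commit) of how a user requests flash attention when creating a llama.cpp context; whether the new CANN kernel is actually used then depends on the constraints noted above (FP16 KV cache, no logit softcap):

```cpp
#include "llama.h"

// Sketch only: `model` is assumed to have been loaded elsewhere
// (e.g. with llama_model_load_from_file()).
static llama_context * make_fa_context(llama_model * model) {
    llama_context_params cparams = llama_context_default_params();
    cparams.flash_attn = true;          // request flash attention
    cparams.type_k     = GGML_TYPE_F16; // FP16 KV cache, as the CANN FA kernel requires
    cparams.type_v     = GGML_TYPE_F16;
    return llama_init_from_model(model, cparams);
}
```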

examples/embedding/embedding.cpp

Lines changed: 2 additions & 2 deletions
@@ -41,8 +41,8 @@ static void batch_decode(llama_context * ctx, llama_batch & batch, float * outpu
 
     // run model
     LOG_INF("%s: n_tokens = %d, n_seq = %d\n", __func__, batch.n_tokens, n_seq);
-    if (llama_encode(ctx, batch) < 0) {
-        LOG_ERR("%s : failed to encode\n", __func__);
+    if (llama_decode(ctx, batch) < 0) {
+        LOG_ERR("%s : failed to process\n", __func__);
     }
 
     for (int i = 0; i < batch.n_tokens; i++) {
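The switch from llama_encode to llama_decode lets the same call path serve both encoder-only and decoder-only embedding models. A minimal sketch of the surrounding pattern, simplified from the example and assuming a single sequence and a context created with embeddings and pooling enabled:

```cpp
#include "llama.h"
#include <vector>

// Sketch only: run one batch through the model and copy back the pooled
// embedding for sequence 0.
static bool decode_and_get_embd(llama_context * ctx, llama_batch & batch, std::vector<float> & out) {
    if (llama_decode(ctx, batch) < 0) {
        return false; // failed to process
    }
    const int     n_embd = llama_model_n_embd(llama_get_model(ctx));
    const float * emb    = llama_get_embeddings_seq(ctx, 0);
    if (emb == nullptr) {
        return false;
    }
    out.assign(emb, emb + n_embd);
    return true;
}
```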

examples/retrieval/retrieval.cpp

Lines changed: 6 additions & 6 deletions
@@ -81,14 +81,14 @@ static void batch_add_seq(llama_batch & batch, const std::vector<int32_t> & toke
     }
 }
 
-static void batch_encode(llama_context * ctx, llama_batch & batch, float * output, int n_seq, int n_embd) {
+static void batch_process(llama_context * ctx, llama_batch & batch, float * output, int n_seq, int n_embd) {
     // clear previous kv_cache values (irrelevant for embeddings)
     llama_kv_self_clear(ctx);
 
     // run model
     LOG_INF("%s: n_tokens = %d, n_seq = %d\n", __func__, batch.n_tokens, n_seq);
-    if (llama_encode(ctx, batch) < 0) {
-        LOG_ERR("%s : failed to encode\n", __func__);
+    if (llama_decode(ctx, batch) < 0) {
+        LOG_ERR("%s : failed to process\n", __func__);
     }
 
     for (int i = 0; i < batch.n_tokens; i++) {
@@ -233,7 +233,7 @@ int main(int argc, char ** argv) {
         // encode if at capacity
         if (batch.n_tokens + n_toks > n_batch) {
             float * out = emb + p * n_embd;
-            batch_encode(ctx, batch, out, s, n_embd);
+            batch_process(ctx, batch, out, s, n_embd);
             common_batch_clear(batch);
             p += s;
             s = 0;
@@ -246,7 +246,7 @@ int main(int argc, char ** argv) {
 
     // final batch
     float * out = emb + p * n_embd;
-    batch_encode(ctx, batch, out, s, n_embd);
+    batch_process(ctx, batch, out, s, n_embd);
 
     // save embeddings to chunks
     for (int i = 0; i < n_chunks; i++) {
@@ -267,7 +267,7 @@ int main(int argc, char ** argv) {
         batch_add_seq(query_batch, query_tokens, 0);
 
         std::vector<float> query_emb(n_embd, 0);
-        batch_encode(ctx, query_batch, query_emb.data(), 1, n_embd);
+        batch_process(ctx, query_batch, query_emb.data(), 1, n_embd);
 
         common_batch_clear(query_batch);
 

examples/training/README.md

Lines changed: 2 additions & 2 deletions
@@ -10,8 +10,8 @@ Proof of concept:
 
 ``` sh
 export model_name=llama_3.2-1b && export quantization=f32
-./build/bin/finetune --file wikitext-2-raw/wiki.test.raw -ngl 999 --model models/${model_name}-${quantization}.gguf -c 512 -b 512 -ub 512
-./build/bin/perplexity --file wikitext-2-raw/wiki.test.raw -ngl 999 --model finetuned-model.gguf
+./build/bin/llama-finetune --file wikitext-2-raw/wiki.test.raw -ngl 999 --model models/${model_name}-${quantization}.gguf -c 512 -b 512 -ub 512
+./build/bin/llama-perplexity --file wikitext-2-raw/wiki.test.raw -ngl 999 --model finetuned-model.gguf
 ```
 
 The perplexity value of the finetuned model should be lower after training on the test set for 2 epochs.

ggml/src/ggml-cann/CMakeLists.txt

File mode changed: 100644 → 100755

ggml/src/ggml-cann/Doxyfile

File mode changed: 100644 → 100755

ggml/src/ggml-cann/acl_tensor.cpp

File mode changed: 100644 → 100755
Lines changed: 2 additions & 0 deletions
@@ -31,6 +31,8 @@ aclDataType ggml_cann_type_mapping(ggml_type type) {
             return ACL_FLOAT;
         case GGML_TYPE_F16:
             return ACL_FLOAT16;
+        case GGML_TYPE_BF16:
+            return ACL_BF16;
         case GGML_TYPE_I8:
             return ACL_INT8;
         case GGML_TYPE_I16:
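With BF16 covered by the mapping, ggml BF16 tensors can be described to the ACL runtime. A minimal sketch of how the mapping is typically consumed; the ACL_DT_UNDEFINED fallback is an assumption about the function's existing default branch:

```cpp
#include "acl_tensor.h"  // ggml_cann_type_mapping

// Sketch only: resolve the ACL dtype for a ggml tensor before building an
// ACL tensor descriptor; the assert guards against unsupported ggml types.
static aclDataType cann_dtype_for(const ggml_tensor * t) {
    aclDataType dt = ggml_cann_type_mapping(t->type); // now covers GGML_TYPE_BF16 -> ACL_BF16
    GGML_ASSERT(dt != ACL_DT_UNDEFINED && "unsupported ggml type for CANN");
    return dt;
}
```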

ggml/src/ggml-cann/acl_tensor.h

File mode changed: 100644 → 100755
