Commit 609ec7e

llama : functions -> methods (#11110)
1 parent c3f9d25 commit 609ec7e

14 files changed: +4602 −4498 lines
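The commit message summarizes the whole change: free functions that took a llama_model or llama_vocab as an explicit argument (llama_model_select_buft, llama_model_get_tensor, llama_token_is_eog_impl, ...) become methods on those types (model.select_buft, model.get_tensor, vocab->is_eog, ...). A minimal, self-contained sketch of that refactor shape, using made-up stand-in types rather than the real llama.cpp declarations:

#include <cstdio>

// Stand-in for llama_model; the real struct carries far more state.
struct model_t {
    int n_layer = 4;

    // After: the selection logic is a method on the type that owns the data ...
    int select_buft(int il) const { return il < n_layer ? il : -1; }
};

// ... where before it was a free function taking the model as its first argument.
static int model_select_buft(const model_t & model, int il) {
    return il < model.n_layer ? il : -1;
}

int main() {
    model_t model;
    std::printf("free function: %d\n", model_select_buft(model, 2)); // old call shape
    std::printf("method:        %d\n", model.select_buft(2));        // new call shape
    return 0;
}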

src/llama-adapter.cpp

Lines changed: 3 additions & 2 deletions
@@ -1,5 +1,6 @@
 #include "llama-adapter.h"
 
+#include "llama-mmap.h"
 #include "llama-model.h"
 
 #include <algorithm>
@@ -62,7 +63,7 @@ static bool llama_control_vector_init(struct llama_control_vector & cvec, const
     cvec.tensors.reserve(hparams.n_layer);
     cvec.tensors.push_back(nullptr); // there's never a tensor for layer 0
     for (size_t il = 1; il < hparams.n_layer; il++) {
-        ggml_backend_buffer_type_t buft = llama_model_select_buft(model, il);
+        ggml_backend_buffer_type_t buft = model.select_buft(il);
         ggml_context * ctx = ctx_for_buft(buft);
         if (!ctx) {
             LLAMA_LOG_ERROR("%s: failed to allocate context for control vector\n", __func__);
@@ -262,7 +263,7 @@ static void llama_lora_adapter_init_impl(struct llama_model & model, const char
     }
 
     // device buft and device ctx
-    auto * model_tensor = llama_model_get_tensor(model, name.c_str());
+    const auto * model_tensor = model.get_tensor(name.c_str());
     if (!model_tensor) {
         throw std::runtime_error("LoRA tensor '" + name + "' does not exist in base model (hint: maybe wrong base model?)");
     }

src/llama-context.cpp

Lines changed: 3 additions & 1 deletion
@@ -1,5 +1,7 @@
 #include "llama-context.h"
 
+#include "llama-mmap.h"
+
 #include <cassert>
 #include <cmath>
 #include <cstring>
@@ -504,7 +506,7 @@ size_t llama_output_reserve(struct llama_context & lctx, size_t n_outputs) {
 
     auto * buft = ggml_backend_cpu_buffer_type();
     // try to use the host buffer of the device where the output tensor is allocated for faster transfer to system memory
-    auto * output_dev = lctx.model.dev_output.dev;
+    auto * output_dev = lctx.model.dev_output();
     auto * output_dev_host_buft = output_dev ? ggml_backend_dev_host_buffer_type(output_dev) : nullptr;
     if (output_dev_host_buft) {
         buft = output_dev_host_buft;
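The second hunk is a slightly different flavor of the same cleanup: a direct member chain (lctx.model.dev_output.dev) is replaced by an accessor (lctx.model.dev_output()), so callers no longer depend on how the model stores its device bookkeeping. A small stand-alone sketch of that encapsulation step, with made-up names (device_t, dev_output_) rather than the real llama_model layout:

#include <cstdio>

// Stand-in device handle; in llama.cpp this is a ggml backend device pointer.
using device_t = int;

struct model_t {
    // Before the change, callers reached into a public member like `dev_output.dev`.
    // Keeping the field private and exposing an accessor lets the model change how it
    // stores device assignments without touching call sites.
    device_t dev_output() const { return dev_output_; }

private:
    device_t dev_output_ = 0;
};

int main() {
    model_t model;
    std::printf("output device: %d\n", model.dev_output());
    return 0;
}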

src/llama-grammar.cpp

Lines changed: 4 additions & 4 deletions
@@ -1092,9 +1092,9 @@ void llama_grammar_apply_impl(const struct llama_grammar & grammar, llama_token_
 
     for (size_t i = 0; i < cur_p->size; ++i) {
         const llama_token id = cur_p->data[i].id;
-        const std::string & piece = grammar.vocab->cache_token_to_piece.at(id);
+        const std::string & piece = grammar.vocab->token_to_piece(id);
 
-        if (llama_token_is_eog_impl(*grammar.vocab, id)) {
+        if (grammar.vocab->is_eog(id)) {
             if (!allow_eog) {
                 cur_p->data[i].logit = -INFINITY;
             }
@@ -1115,7 +1115,7 @@ void llama_grammar_apply_impl(const struct llama_grammar & grammar, llama_token_
 void llama_grammar_accept_impl(struct llama_grammar & grammar, llama_token token) {
     GGML_ASSERT(grammar.vocab != nullptr);
 
-    if (llama_token_is_eog_impl(*grammar.vocab, token)) {
+    if (grammar.vocab->is_eog(token)) {
         for (const auto & stack : grammar.stacks) {
             if (stack.empty()) {
                 return;
@@ -1124,7 +1124,7 @@ void llama_grammar_accept_impl(struct llama_grammar & grammar, llama_token token
         GGML_ABORT("fatal error");
     }
 
-    const std::string & piece = grammar.vocab->cache_token_to_piece.at(token);
+    const std::string & piece = grammar.vocab->token_to_piece(token);
 
     // Note terminating 0 in decoded string
     const auto decoded = decode_utf8(piece, grammar.partial_utf8);

src/llama-kv-cache.cpp

Lines changed: 1 addition & 1 deletion
@@ -79,7 +79,7 @@ bool llama_kv_cache_init(
 
     ggml_backend_buffer_type_t buft;
     if (offload) {
-        auto * dev = model.dev_layer.at(i).dev;
+        auto * dev = model.dev_layer(i);
         buft = ggml_backend_dev_buffer_type(dev);
     } else {
         buft = ggml_backend_cpu_buffer_type();

src/llama-mmap.cpp

Lines changed: 1 addition & 1 deletion
@@ -35,7 +35,7 @@
 
 // TODO: consider moving to llama-impl.h if needed in more places
 #if defined(_WIN32)
-std::string llama_format_win_err(DWORD err) {
+static std::string llama_format_win_err(DWORD err) {
     LPSTR buf;
     size_t size = FormatMessageA(FORMAT_MESSAGE_ALLOCATE_BUFFER | FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS,
                                  NULL, err, MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT), (LPSTR)&buf, 0, NULL);
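The only functional change in this file is the added `static`, which gives the Windows-only helper internal linkage: it is visible only inside llama-mmap.cpp and cannot clash with another definition at link time. A tiny stand-alone illustration of the same idea (format_err is a made-up name, not a llama.cpp symbol):

#include <cstdio>
#include <string>

// `static` at namespace scope means internal linkage: this helper is private to the
// current translation unit, so another .cpp may define its own format_err without
// causing a duplicate-symbol error or an ODR violation.
static std::string format_err(int code) {
    return "error " + std::to_string(code);
}

int main() {
    std::printf("%s\n", format_err(42).c_str());
    return 0;
}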

src/llama-model-loader.cpp

Lines changed: 61 additions & 0 deletions
@@ -7,6 +7,10 @@
 #include <cstring>
 #include <future>
 
+static const size_t kiB = 1024;
+static const size_t MiB = 1024*kiB;
+static const size_t GiB = 1024*MiB;
+
 const char * llama_file_version_name(llama_fver version) {
     switch (version) {
         case GGUF_FILE_VERSION_V1: return "GGUF V1 (support until nov 2023)";
@@ -17,6 +21,49 @@ const char * llama_file_version_name(llama_fver version) {
     return "unknown";
 }
 
+static std::string llama_model_ftype_name(llama_ftype ftype) {
+    if (ftype & LLAMA_FTYPE_GUESSED) {
+        return llama_model_ftype_name((enum llama_ftype) (ftype & ~LLAMA_FTYPE_GUESSED)) + " (guessed)";
+    }
+
+    switch (ftype) {
+        case LLAMA_FTYPE_ALL_F32:        return "all F32";
+        case LLAMA_FTYPE_MOSTLY_F16:     return "F16";
+        case LLAMA_FTYPE_MOSTLY_BF16:    return "BF16";
+        case LLAMA_FTYPE_MOSTLY_Q4_0:    return "Q4_0";
+        case LLAMA_FTYPE_MOSTLY_Q4_1:    return "Q4_1";
+        case LLAMA_FTYPE_MOSTLY_Q5_0:    return "Q5_0";
+        case LLAMA_FTYPE_MOSTLY_Q5_1:    return "Q5_1";
+        case LLAMA_FTYPE_MOSTLY_Q8_0:    return "Q8_0";
+        case LLAMA_FTYPE_MOSTLY_Q2_K:    return "Q2_K - Medium";
+        case LLAMA_FTYPE_MOSTLY_Q2_K_S:  return "Q2_K - Small";
+        case LLAMA_FTYPE_MOSTLY_Q3_K_S:  return "Q3_K - Small";
+        case LLAMA_FTYPE_MOSTLY_Q3_K_M:  return "Q3_K - Medium";
+        case LLAMA_FTYPE_MOSTLY_Q3_K_L:  return "Q3_K - Large";
+        case LLAMA_FTYPE_MOSTLY_Q4_K_S:  return "Q4_K - Small";
+        case LLAMA_FTYPE_MOSTLY_Q4_K_M:  return "Q4_K - Medium";
+        case LLAMA_FTYPE_MOSTLY_Q5_K_S:  return "Q5_K - Small";
+        case LLAMA_FTYPE_MOSTLY_Q5_K_M:  return "Q5_K - Medium";
+        case LLAMA_FTYPE_MOSTLY_Q6_K:    return "Q6_K";
+        case LLAMA_FTYPE_MOSTLY_TQ1_0:   return "TQ1_0 - 1.69 bpw ternary";
+        case LLAMA_FTYPE_MOSTLY_TQ2_0:   return "TQ2_0 - 2.06 bpw ternary";
+        case LLAMA_FTYPE_MOSTLY_IQ2_XXS: return "IQ2_XXS - 2.0625 bpw";
+        case LLAMA_FTYPE_MOSTLY_IQ2_XS:  return "IQ2_XS - 2.3125 bpw";
+        case LLAMA_FTYPE_MOSTLY_IQ2_S:   return "IQ2_S - 2.5 bpw";
+        case LLAMA_FTYPE_MOSTLY_IQ2_M:   return "IQ2_M - 2.7 bpw";
+        case LLAMA_FTYPE_MOSTLY_IQ3_XS:  return "IQ3_XS - 3.3 bpw";
+        case LLAMA_FTYPE_MOSTLY_IQ3_XXS: return "IQ3_XXS - 3.0625 bpw";
+        case LLAMA_FTYPE_MOSTLY_IQ1_S:   return "IQ1_S - 1.5625 bpw";
+        case LLAMA_FTYPE_MOSTLY_IQ1_M:   return "IQ1_M - 1.75 bpw";
+        case LLAMA_FTYPE_MOSTLY_IQ4_NL:  return "IQ4_NL - 4.5 bpw";
+        case LLAMA_FTYPE_MOSTLY_IQ4_XS:  return "IQ4_XS - 4.25 bpw";
+        case LLAMA_FTYPE_MOSTLY_IQ3_S:   return "IQ3_S - 3.4375 bpw";
+        case LLAMA_FTYPE_MOSTLY_IQ3_M:   return "IQ3_S mix - 3.66 bpw";
+
+        default: return "unknown, may not work";
+    }
+}
+
 namespace GGUFMeta {
     template <typename T, gguf_type gt_, T (*gfun)(const gguf_context *, const int64_t)>
     struct GKV_Base_Type {
@@ -1009,3 +1056,17 @@ bool llama_model_loader::load_all_data(
 
     return true;
 }
+
+std::string llama_model_loader::ftype_name() const {
+    return llama_model_ftype_name(ftype);
+}
+
+void llama_model_loader::print_info() const {
+    LLAMA_LOG_INFO("%s: file format = %s\n", __func__, llama_file_version_name(fver));
+    LLAMA_LOG_INFO("%s: file type   = %s\n", __func__, llama_model_ftype_name(ftype).c_str());
+    if (n_bytes < GiB) {
+        LLAMA_LOG_INFO("%s: file size   = %.2f MiB (%.2f BPW) \n", __func__, n_bytes/1024.0/1024.0, n_bytes*8.0/n_elements);
+    } else {
+        LLAMA_LOG_INFO("%s: file size   = %.2f GiB (%.2f BPW) \n", __func__, n_bytes/1024.0/1024.0/1024.0, n_bytes*8.0/n_elements);
+    }
+}
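The new print_info() reports the file size in MiB below the 1 GiB threshold and in GiB above it, and computes bits-per-weight as total bits divided by total element count. The same arithmetic in a stand-alone form (the n_bytes and n_elements values below are invented examples, not taken from any model):

#include <cstdint>
#include <cstdio>

int main() {
    const uint64_t kiB = 1024, MiB = 1024*kiB, GiB = 1024*MiB;

    // Invented example: a ~4.3 GiB file holding 8 billion weights.
    const uint64_t n_bytes    = 4*GiB + 300*MiB;
    const uint64_t n_elements = 8000000000ull;

    const double bpw = n_bytes*8.0/n_elements; // bits per weight

    if (n_bytes < GiB) {
        std::printf("file size = %.2f MiB (%.2f BPW)\n", n_bytes/1024.0/1024.0, bpw);
    } else {
        std::printf("file size = %.2f GiB (%.2f BPW)\n", n_bytes/1024.0/1024.0/1024.0, bpw);
    }
    return 0;
}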

src/llama-model-loader.h

Lines changed: 4 additions & 0 deletions
@@ -155,4 +155,8 @@ struct llama_model_loader {
         llama_mlocks * lmlocks,
         llama_progress_callback progress_callback,
         void * progress_callback_user_data);
+
+    std::string ftype_name() const;
+
+    void print_info() const;
 };
