Commit a7df071

llama : impl
ggml-ci
1 parent b0d6b66

16 files changed: +230 additions, −209 deletions

common/common.h

Lines changed: 7 additions & 3 deletions
@@ -638,6 +638,10 @@ common_control_vector_data common_control_vector_load(const std::vector<common_c
 // Split utils
 //

-static const char * const LLM_KV_SPLIT_NO            = "split.no";
-static const char * const LLM_KV_SPLIT_COUNT         = "split.count";
-static const char * const LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count";
+namespace {
+
+const char * const LLM_KV_SPLIT_NO            = "split.no";
+const char * const LLM_KV_SPLIT_COUNT         = "split.count";
+const char * const LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count";
+
+}
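
The change above replaces per-declaration `static` with an unnamed namespace. Both spellings give the constants internal linkage (each translation unit that includes common.h gets its own copy), so behavior is unchanged; the namespace form simply avoids repeating the keyword and covers any future additions automatically. A minimal standalone sketch of the equivalence, with illustrative names not taken from the commit:

    // internal_linkage.cpp - both declarations below have internal linkage
    #include <cstdio>

    static const char * const KV_OLD_STYLE = "split.no";   // explicit `static`

    namespace {                                            // unnamed namespace
    const char * const KV_NEW_STYLE = "split.count";
    } // namespace

    int main() {
        std::printf("%s %s\n", KV_OLD_STYLE, KV_NEW_STYLE);
        return 0;
    }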

examples/gguf-split/gguf-split.cpp

Lines changed: 3 additions & 4 deletions
@@ -2,15 +2,14 @@
 #include "common.h"

 #include <algorithm>
-#include <cmath>
 #include <cstdlib>
 #include <fstream>
 #include <string>
 #include <vector>
-
-#include <stdio.h>
-#include <string.h>
 #include <climits>
+
+#include <cstdio>
+#include <cstring>
 #include <stdexcept>

 #if defined(_WIN32)
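
The include cleanup above also swaps the C headers <stdio.h> and <string.h> for their C++ counterparts <cstdio> and <cstring>, the idiomatic spelling in C++ sources: the wrapper headers are guaranteed to declare the functions in namespace std (and in practice also in the global namespace). A tiny sketch, not part of the commit:

    #include <cstdio>   // declares std::printf
    #include <cstring>  // declares std::strlen

    int main() {
        const char * msg = "hello";
        std::printf("%zu\n", std::strlen(msg));  // std-qualified names are always available
        return 0;
    }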

examples/quantize-stats/quantize-stats.cpp

Lines changed: 7 additions & 9 deletions
@@ -1,19 +1,17 @@
-#include "common.h"
 #include "ggml.h"
 #include "llama.h"
-#include "llama-impl.h"
+#include "llama-context.h"
+#include "common.h"

 #include <algorithm>
 #include <cassert>
 #include <cinttypes>
 #include <cmath>
 #include <cstdio>
 #include <cstring>
-#include <map>
 #include <numeric>
 #include <regex>
 #include <string>
-#include <unordered_map>
 #include <vector>
 #include <thread>
 #include <mutex>
@@ -330,13 +328,13 @@ int main(int argc, char ** argv) {
         }
     }

-    const auto &tensors = llama_internal_get_tensor_map(ctx);
+    const auto & tensors = llama_internal_get_tensor_map(ctx);

     // check layer tensors
     int included_layers = 0;
     int64_t max_nelements = 0;
     bool is_f16 = false;
-    for (const auto& kv_tensor : tensors) {
+    for (const auto & kv_tensor : tensors) {
         if (!layer_included(params, kv_tensor.first)) {
             continue;
         }
@@ -371,8 +369,8 @@ int main(int argc, char ** argv) {
         if (!params.include_types.empty() && std::find(params.include_types.begin(), params.include_types.end(), i) == params.include_types.end()) {
             continue;
         }
-        const auto * qfns = ggml_get_type_traits(type);
-        const auto * qfns_cpu = ggml_get_type_traits_cpu(type);
+        const auto * qfns     = ggml_get_type_traits(type);
+        const auto * qfns_cpu = ggml_get_type_traits_cpu(type);
         if (qfns_cpu->from_float && qfns->to_float) {
             if (params.verbose) {
                 printf("testing %s ...\n", ggml_type_name(type));
@@ -382,7 +380,7 @@ int main(int argc, char ** argv) {

     error_stats global_stats {};

-    for (const auto& kv_tensor : tensors) {
+    for (const auto & kv_tensor : tensors) {
         if (!layer_included(params, kv_tensor.first)) {
             continue;
         }

src/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
@@ -15,6 +15,7 @@ add_library(llama
             llama-chat.cpp
             llama-context.cpp
             llama-hparams.cpp
+            llama-impl.cpp
             llama-grammar.cpp
             llama-kv-cache.cpp
             llama-mmap.cpp

src/llama-adapter.cpp

Lines changed: 1 addition & 0 deletions
@@ -5,6 +5,7 @@
 #include <algorithm>
 #include <map>
 #include <cassert>
+#include <stdexcept>

 // vec

src/llama-batch.h

Lines changed: 4 additions & 2 deletions
@@ -26,7 +26,9 @@ struct llama_ubatch {

 struct llama_sbatch_seq {
     int32_t n_seq_id;
+
     llama_seq_id * seq_id;
+
     size_t offset;
     size_t length;
 };
@@ -112,8 +114,8 @@ struct llama_sbatch {
         if (ubatch.equal_seqs) {
             for (size_t i = 0; i < length; ++i) {
                 memcpy(
-                    ubatch.embd + n_embd * (ubatch.n_tokens + i),
-                    batch->embd + n_embd * ids[seq.offset + i],
+                    ubatch.embd + (n_embd * (ubatch.n_tokens + i)),
+                    batch->embd + (n_embd * ids[seq.offset + i]),
                     n_embd * sizeof(float)
                 );
             }
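
The extra parentheses added to the memcpy source and destination expressions above do not change the arithmetic, since multiplication already binds tighter than the pointer addition; they only make the element-offset grouping explicit for the reader.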

src/llama-context.cpp

Lines changed: 7 additions & 0 deletions
@@ -1,5 +1,7 @@
 #include "llama-context.h"

+#include <stdexcept>
+
 // deprecated
 size_t llama_get_state_size(struct llama_context * ctx) {
     return llama_state_get_size(ctx);
@@ -968,3 +970,8 @@ size_t llama_state_seq_load_file(struct llama_context * ctx, const char * filepa
     }
 }

+const std::vector<std::pair<std::string, struct ggml_tensor *>> & llama_internal_get_tensor_map(
+    struct llama_context * ctx
+) {
+    return ctx->model.tensors_by_name;
+}
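
The new out-of-line definition exposes the model's name/tensor pairs so that the quantize-stats example can iterate them without reaching into llama internals directly. A hedged sketch of a caller, assuming `ctx` is a valid llama_context created elsewhere (the helper name dump_tensor_names is illustrative, not from the commit):

    #include "llama-context.h"
    #include "ggml.h"

    #include <cstdio>

    // Print every named tensor in the loaded model along with its element count.
    static void dump_tensor_names(struct llama_context * ctx) {
        const auto & tensors = llama_internal_get_tensor_map(ctx);
        for (const auto & kv : tensors) {
            // kv.first is the tensor name, kv.second the ggml tensor
            std::printf("%s: %lld\n", kv.first.c_str(), (long long) ggml_nelements(kv.second));
        }
    }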

src/llama-context.h

Lines changed: 4 additions & 0 deletions
@@ -219,3 +219,7 @@ static void llama_output_reorder(struct llama_context * ctx) {
         out_ids.clear();
     }
 }
+
+// For internal test use
+// TODO: remove
+const std::vector<std::pair<std::string, struct ggml_tensor *>> & llama_internal_get_tensor_map(struct llama_context * ctx);

src/llama-grammar.cpp

Lines changed: 1 addition & 0 deletions
@@ -1,5 +1,6 @@
 #include "llama-grammar.h"

+#include "llama-impl.h"
 #include "llama-vocab.h"
 #include "llama-sampling.h"

src/llama-grammar.h

Lines changed: 3 additions & 1 deletion
@@ -1,8 +1,10 @@
 #pragma once

-#include "llama-impl.h"
+#include "llama.h"

 #include <map>
+#include <string>
+#include <vector>

 struct llama_vocab;

src/llama-impl.cpp

Lines changed: 74 additions & 0 deletions
@@ -0,0 +1,74 @@
+#include "llama-impl.h"
+
+#include "llama.h"
+
+#include <cstdarg>
+
+struct llama_logger_state {
+    ggml_log_callback log_callback = llama_log_callback_default;
+    void * log_callback_user_data = nullptr;
+};
+
+static llama_logger_state g_logger_state;
+
+time_meas::time_meas(int64_t & t_acc, bool disable) : t_start_us(disable ? -1 : ggml_time_us()), t_acc(t_acc) {}
+
+time_meas::~time_meas() {
+    if (t_start_us >= 0) {
+        t_acc += ggml_time_us() - t_start_us;
+    }
+}
+
+void replace_all(std::string & s, const std::string & search, const std::string & replace) {
+    if (search.empty()) {
+        return;
+    }
+    std::string builder;
+    builder.reserve(s.length());
+    size_t pos = 0;
+    size_t last_pos = 0;
+    while ((pos = s.find(search, last_pos)) != std::string::npos) {
+        builder.append(s, last_pos, pos - last_pos);
+        builder.append(replace);
+        last_pos = pos + search.length();
+    }
+    builder.append(s, last_pos, std::string::npos);
+    s = std::move(builder);
+}
+
+void llama_log_set(ggml_log_callback log_callback, void * user_data) {
+    ggml_log_set(log_callback, user_data);
+    g_logger_state.log_callback = log_callback ? log_callback : llama_log_callback_default;
+    g_logger_state.log_callback_user_data = user_data;
+}
+
+static void llama_log_internal_v(ggml_log_level level, const char * format, va_list args) {
+    va_list args_copy;
+    va_copy(args_copy, args);
+    char buffer[128];
+    int len = vsnprintf(buffer, 128, format, args);
+    if (len < 128) {
+        g_logger_state.log_callback(level, buffer, g_logger_state.log_callback_user_data);
+    } else {
+        char * buffer2 = new char[len + 1];
+        vsnprintf(buffer2, len + 1, format, args_copy);
+        buffer2[len] = 0;
+        g_logger_state.log_callback(level, buffer2, g_logger_state.log_callback_user_data);
+        delete[] buffer2;
+    }
+    va_end(args_copy);
+}
+
+void llama_log_internal(ggml_log_level level, const char * format, ...) {
+    va_list args;
+    va_start(args, format);
+    llama_log_internal_v(level, format, args);
+    va_end(args);
+}
+
+void llama_log_callback_default(ggml_log_level level, const char * text, void * user_data) {
+    (void) level;
+    (void) user_data;
+    fputs(text, stderr);
+    fflush(stderr);
+}
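
The new translation unit centralizes llama's logging state: llama_log_set forwards the callback to ggml and keeps a local copy for llama's own messages, while llama_log_internal_v formats into a 128-byte stack buffer and falls back to a heap allocation only for longer messages (hence the va_copy, since the va_list may be consumed twice). A hedged usage sketch of the public entry point; the callback and file name are illustrative:

    #include "llama.h"

    #include <cstdio>

    // Illustrative callback: forward all llama/ggml log output to a file.
    static void log_to_file(enum ggml_log_level level, const char * text, void * user_data) {
        (void) level;
        std::fputs(text, (FILE *) user_data);
    }

    int main() {
        FILE * f = std::fopen("llama.log", "w");  // hypothetical log destination
        if (f != nullptr) {
            llama_log_set(log_to_file, f);        // replaces llama_log_callback_default
        }
        // ... call llama APIs here; their log messages now land in llama.log ...
        if (f != nullptr) {
            std::fclose(f);
        }
        return 0;
    }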
