
Commit 260b4a5

speculative : add initial poc
1 parent b7f2aa9 commit 260b4a5

File tree

16 files changed: +410 -114 lines

common/common.cpp

Lines changed: 11 additions & 4 deletions
@@ -24,9 +24,7 @@
 
 #if defined(_WIN32)
 #define WIN32_LEAN_AND_MEAN
-#ifndef NOMINMAX
-# define NOMINMAX
-#endif
+#define NOMINMAX
 #include <codecvt>
 #include <locale>
 #include <windows.h>
@@ -317,6 +315,12 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
                 break;
             }
             params.model = argv[i];
+        } else if (arg == "-md" || arg == "--model-draft") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.model_draft = argv[i];
         } else if (arg == "-a" || arg == "--alias") {
             if (++i >= argc) {
                 invalid_param = true;
@@ -669,6 +673,8 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     fprintf(stdout, "  --lora-base FNAME     optional model to use as a base for the layers modified by the LoRA adapter\n");
     fprintf(stdout, "  -m FNAME, --model FNAME\n");
     fprintf(stdout, "                        model path (default: %s)\n", params.model.c_str());
+    fprintf(stdout, "  -md FNAME, --model-draft FNAME\n");
+    fprintf(stdout, "                        draft model for speculative sampling (default: %s)\n", params.model.c_str());
     fprintf(stdout, "  -ld LOGDIR, --logdir LOGDIR\n");
     fprintf(stdout, "                        path under which to save YAML logs (no logging if unset)\n");
     fprintf(stdout, "\n");
@@ -1029,7 +1035,7 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
     dump_string_yaml_multiline(stream, "grammar", params.grammar.c_str());
     fprintf(stream, "grammar-file: # never logged, see grammar instead. Can still be specified for input.\n");
     fprintf(stream, "hellaswag: %s # default: false\n", params.hellaswag ? "true" : "false");
-    fprintf(stream, "hellaswag_tasks: %zu # default: 400\n", params.hellaswag_tasks);
+    fprintf(stream, "hellaswag_tasks: %ld # default: 400\n", params.hellaswag_tasks);
 
     const auto logit_bias_eos = params.logit_bias.find(llama_token_eos(lctx));
     const bool ignore_eos = logit_bias_eos != params.logit_bias.end() && logit_bias_eos->second == -INFINITY;
@@ -1062,6 +1068,7 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
     fprintf(stream, "mirostat_lr: %f # default: 0.1\n", params.mirostat_eta);
     fprintf(stream, "mlock: %s # default: false\n", params.use_mlock ? "true" : "false");
     fprintf(stream, "model: %s # default: models/7B/ggml-model.bin\n", params.model.c_str());
+    fprintf(stream, "model_draft: %s # default:\n", params.model_draft.c_str());
     fprintf(stream, "mtest: %s # default: false\n", params.mem_test ? "true" : "false");
     fprintf(stream, "multiline_input: %s # default: false\n", params.multiline_input ? "true" : "false");
     fprintf(stream, "n_gpu_layers: %d # default: 0\n", params.n_gpu_layers);

common/common.h

Lines changed: 1 addition & 0 deletions
@@ -63,6 +63,7 @@ struct gpt_params {
     float cfg_scale = 1.f; // How strong is guidance
 
     std::string model = "models/7B/ggml-model-f16.gguf"; // model path
+    std::string model_draft = ""; // draft model for speculative sampling
     std::string model_alias = "unknown"; // model alias
     std::string prompt = "";
     std::string path_prompt_cache = ""; // path to file for saving/loading prompt eval state
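
A minimal sketch of how the new field can be consumed, assuming the llama_init_from_gpt_params() helper that common.h exposes at this point in the tree (an illustration of intended usage, not code from this commit):

    #include "common.h"
    #include "llama.h"

    #include <tuple>

    int main(int argc, char ** argv) {
        gpt_params params;
        if (!gpt_params_parse(argc, argv, params)) {
            return 1;
        }

        llama_backend_init(params.numa);

        // load the target model from -m / --model
        llama_model   * model_tgt = nullptr;
        llama_context * ctx_tgt   = nullptr;
        std::tie(model_tgt, ctx_tgt) = llama_init_from_gpt_params(params);

        // reuse the same loader for the draft model from -md / --model-draft
        params.model = params.model_draft;

        llama_model   * model_dft = nullptr;
        llama_context * ctx_dft   = nullptr;
        std::tie(model_dft, ctx_dft) = llama_init_from_gpt_params(params);

        // ... the draft-and-verify loop would go here ...

        llama_free(ctx_tgt);
        llama_free_model(model_tgt);
        llama_free(ctx_dft);
        llama_free_model(model_dft);

        llama_backend_free();
        return 0;
    }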

common/console.cpp

Lines changed: 0 additions & 1 deletion
@@ -235,7 +235,6 @@ namespace console {
 
     int estimateWidth(char32_t codepoint) {
 #if defined(_WIN32)
-        (void)codepoint;
         return 1;
 #else
         return wcwidth(codepoint);

common/log.h

Lines changed: 16 additions & 16 deletions
@@ -154,7 +154,7 @@ inline std::string log_filename_generator_impl(const std::string & log_file_base
 // #include "log.h"
 //
 #ifndef LOG_NO_TIMESTAMPS
-#ifndef _MSC_VER
+#ifndef _WIN32
 #define LOG_TIMESTAMP_FMT "[%" PRIu64 "] "
 #define LOG_TIMESTAMP_VAL , (std::chrono::duration_cast<std::chrono::duration<std::uint64_t>>(std::chrono::system_clock::now().time_since_epoch())).count()
 #else
@@ -167,7 +167,7 @@ inline std::string log_filename_generator_impl(const std::string & log_file_base
 #endif
 
 #ifdef LOG_TEE_TIMESTAMPS
-#ifndef _MSC_VER
+#ifndef _WIN32
 #define LOG_TEE_TIMESTAMP_FMT "[%" PRIu64 "] "
 #define LOG_TEE_TIMESTAMP_VAL , (std::chrono::duration_cast<std::chrono::duration<std::uint64_t>>(std::chrono::system_clock::now().time_since_epoch())).count()
 #else
@@ -187,7 +187,7 @@ inline std::string log_filename_generator_impl(const std::string & log_file_base
 // #include "log.h"
 //
 #ifndef LOG_NO_FILE_LINE_FUNCTION
-#ifndef _MSC_VER
+#ifndef _WIN32
 #define LOG_FLF_FMT "[%24s:%5d][%24s] "
 #define LOG_FLF_VAL , __FILE__, __LINE__, __FUNCTION__
 #else
@@ -200,7 +200,7 @@ inline std::string log_filename_generator_impl(const std::string & log_file_base
 #endif
 
 #ifdef LOG_TEE_FILE_LINE_FUNCTION
-#ifndef _MSC_VER
+#ifndef _WIN32
 #define LOG_TEE_FLF_FMT "[%24s:%5d][%24s] "
 #define LOG_TEE_FLF_VAL , __FILE__, __LINE__, __FUNCTION__
 #else
@@ -224,7 +224,7 @@ enum LogTriState
 // INTERNAL, DO NOT USE
 // USE LOG() INSTEAD
 //
-#ifndef _MSC_VER
+#ifndef _WIN32
 #define LOG_IMPL(str, ...) \
 { \
     if (LOG_TARGET != nullptr) \
@@ -247,7 +247,7 @@ enum LogTriState
 // INTERNAL, DO NOT USE
 // USE LOG_TEE() INSTEAD
 //
-#ifndef _MSC_VER
+#ifndef _WIN32
 #define LOG_TEE_IMPL(str, ...) \
 { \
     if (LOG_TARGET != nullptr) \
@@ -284,7 +284,7 @@ enum LogTriState
 // Main LOG macro.
 // behaves like printf, and supports arguments the exact same way.
 //
-#ifndef _MSC_VER
+#ifndef _WIN32
 #define LOG(...) LOG_IMPL(__VA_ARGS__, "")
 #else
 #define LOG(str, ...) LOG_IMPL("%s" str, "", __VA_ARGS__, "")
@@ -298,14 +298,14 @@ enum LogTriState
 // Secondary target can be changed just like LOG_TARGET
 // by defining LOG_TEE_TARGET
 //
-#ifndef _MSC_VER
+#ifndef _WIN32
 #define LOG_TEE(...) LOG_TEE_IMPL(__VA_ARGS__, "")
 #else
 #define LOG_TEE(str, ...) LOG_TEE_IMPL("%s" str, "", __VA_ARGS__, "")
 #endif
 
 // LOG macro variants with auto endline.
-#ifndef _MSC_VER
+#ifndef _WIN32
 #define LOGLN(...) LOG_IMPL(__VA_ARGS__, "\n")
 #define LOG_TEELN(...) LOG_TEE_IMPL(__VA_ARGS__, "\n")
 #else
@@ -341,14 +341,14 @@ inline FILE *log_handler1_impl(bool change = false, LogTriState disable = LogTri
         }
     }
 
-    if (_disabled)
-    {
-        // Log is disabled
-        return nullptr;
-    }
-
     if (_initialized)
     {
+        if (_disabled)
+        {
+            // Log is disabled
+            return nullptr;
+        }
+
         // with fallback in case something went wrong
         return logfile ? logfile : stderr;
     }
@@ -461,7 +461,7 @@ inline void log_test()
     LOG("13 Hello World this time in yet new file?\n")
     log_set_target(log_filename_generator("llama_autonamed", "log"));
     LOG("14 Hello World in log with generated filename!\n")
-#ifdef _MSC_VER
+#ifdef _WIN32
     LOG_TEE("15 Hello msvc TEE without arguments\n")
     LOG_TEE("16 Hello msvc TEE with (%d)(%s) arguments\n", 1, "test")
     LOG_TEELN("17 Hello msvc TEELN without arguments\n")

examples/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
@@ -23,6 +23,7 @@ else()
     add_subdirectory(train-text-from-scratch)
     add_subdirectory(convert-llama2c-to-ggml)
     add_subdirectory(simple)
+    add_subdirectory(speculative)
     add_subdirectory(embd-input)
     add_subdirectory(llama-bench)
     add_subdirectory(beam-search)
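
The new subdirectory implies an examples/speculative/CMakeLists.txt of its own; it is not shown on this page, but it presumably mirrors the boilerplate of the sibling examples, roughly:

    set(TARGET speculative)
    add_executable(${TARGET} speculative.cpp)
    target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})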

examples/baby-llama/baby-llama.cpp

Lines changed: 5 additions & 0 deletions
@@ -1617,10 +1617,15 @@ int main(int argc, char ** argv) {
 
         float error_before_opt = ggml_get_f32_1d(e, 0);
 
+        struct ggml_opt_params opt_params_adam = ggml_opt_default_params(GGML_OPT_ADAM);
         struct ggml_opt_params opt_params_lbfgs = ggml_opt_default_params(GGML_OPT_LBFGS);
+        opt_params_adam.print_forward_graph = false;
+        opt_params_adam.print_backward_graph = false;
         opt_params_lbfgs.print_forward_graph = false;
         opt_params_lbfgs.print_backward_graph = false;
+        opt_params_adam.adam.n_iter = 16;
         opt_params_lbfgs.lbfgs.n_iter = 16;
+        // ggml_opt(ctx0, opt_params_adam, e);
         ggml_opt(ctx0, opt_params_lbfgs, e);
         //
         ggml_build_forward_expand(&gf, e);
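
The hunk prepares an ADAM parameter block next to the existing L-BFGS one, so switching optimizers becomes a one-line change at the ggml_opt() call site. A stripped-down, self-contained illustration of that pattern on a toy objective (an assumption-level sketch, not baby-llama code):

    #include "ggml.h"

    int main() {
        struct ggml_init_params ip = { 16*1024*1024, NULL, false }; // mem_size, mem_buffer, no_alloc
        struct ggml_context * ctx0 = ggml_init(ip);

        // toy objective: e = x^2, minimized over the parameter x
        struct ggml_tensor * x = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
        ggml_set_f32(x, 2.0f);
        ggml_set_param(ctx0, x);
        struct ggml_tensor * e = ggml_sqr(ctx0, x);

        struct ggml_opt_params opt_params_adam  = ggml_opt_default_params(GGML_OPT_ADAM);
        struct ggml_opt_params opt_params_lbfgs = ggml_opt_default_params(GGML_OPT_LBFGS);
        opt_params_adam.adam.n_iter   = 16;
        opt_params_lbfgs.lbfgs.n_iter = 16;

        const bool use_adam = false; // flip to compare the two optimizers
        ggml_opt(ctx0, use_adam ? opt_params_adam : opt_params_lbfgs, e);

        ggml_free(ctx0);
        return 0;
    }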

examples/beam-search/beam-search.cpp

Lines changed: 3 additions & 5 deletions
@@ -22,9 +22,7 @@
 #include <unistd.h>
 #elif defined (_WIN32)
 #define WIN32_LEAN_AND_MEAN
-#ifndef NOMINMAX
-# define NOMINMAX
-#endif
+#define NOMINMAX
 #include <windows.h>
 #include <signal.h>
 #endif
@@ -75,7 +73,7 @@ void beam_search_callback(void * callback_data_ptr, llama_beams_state beams_stat
         assert(0u < beams_state.n_beams);
         const llama_token * tokens = beams_state.beam_views[0].tokens;
         std::copy(tokens, tokens + n, callback_data.response.end() - n);
-        printf("%zu", n);
+        printf("%lu", n);
     }
     fflush(stdout);
 #if 1 // DEBUG: print current beams for this iteration
@@ -147,7 +145,7 @@ int main(int argc, char ** argv)
 
     if (tokens_list.size() > max_tokens_list_size)
     {
-        fprintf( stderr , "%s: error: prompt too long (%zu tokens, max %zu)\n" ,
+        fprintf( stderr , "%s: error: prompt too long (%lu tokens, max %lu)\n" ,
             __func__ , tokens_list.size() , max_tokens_list_size );
         return 1;
    }
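
A side note on the %zu to %lu swaps in this file: %zu is the C99/C++11 conversion for size_t, whereas %lu expects unsigned long, which is narrower than size_t on LLP64 Windows. A small self-contained illustration:

    #include <cstdio>
    #include <vector>

    int main() {
        std::vector<int> tokens(5, 0);
        printf("%zu tokens\n", tokens.size());                 // matches size_t directly
        printf("%lu tokens\n", (unsigned long) tokens.size()); // cast needed for portability
        return 0;
    }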

examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp

Lines changed: 35 additions & 38 deletions
@@ -75,7 +75,7 @@ typedef struct {
     int seq_len; // max sequence length
 } Config;
 
-struct TransformerWeights {
+typedef struct {
     // token embedding table
     float* token_embedding_table; // (vocab_size, dim)
     // weights for rmsnorms
@@ -97,22 +97,7 @@
     // float* freq_cis_imag; // (seq_len, dim/2)
     // (optional) classifier weights for the logits, on the last layer
     float* wcls;
-
-    ~TransformerWeights() {
-        delete[] token_embedding_table;
-        delete[] rms_att_weight;
-        delete[] rms_ffn_weight;
-        delete[] wq;
-        delete[] wk;
-        delete[] wv;
-        delete[] wo;
-        delete[] w1;
-        delete[] w2;
-        delete[] w3;
-        delete[] rms_final_weight;
-        delete[] wcls;
-    }
-};
+} TransformerWeights;
 
 void malloc_weights(TransformerWeights* w, Config* p, bool shared_weights) {
     // we calloc instead of malloc to keep valgrind happy
@@ -188,6 +173,21 @@ int checkpoint_init_weights(TransformerWeights *w, Config* p, FILE* f, bool shar
     return 0;
 }
 
+void free_weights(TransformerWeights* w) {
+    delete w->token_embedding_table;
+    delete w->rms_att_weight;
+    delete w->rms_ffn_weight;
+    delete w->wq;
+    delete w->wk;
+    delete w->wv;
+    delete w->wo;
+    delete w->w1;
+    delete w->w2;
+    delete w->w3;
+    delete w->rms_final_weight;
+    if (w->wcls) delete w->wcls;
+}
+
 void print_sample_weights(TransformerWeights *w){
     printf("----- Quick print of first of the weight vales of all the variables\n");
     printf("%f\n", w->token_embedding_table[0]);
@@ -596,10 +596,6 @@ void load_vocab(const char *filename, Config *config, struct llama_vocab *vocab)
     // assume llama2.c vocabulary
     printf("Assuming llama2.c vocabulary since %s is not a gguf file\n", filename);
     llama_file file(filename, "rb");
-    if (!file.fp) {
-        fprintf(stderr, "error: %s: %s\n", strerror(errno), filename);
-        exit(1);
-    }
     const int n_vocab = config->vocab_size;
     /* uint32_t max_token_length = */ file.read_u32(); // unused
     vocab->id_to_token.resize(n_vocab);
@@ -637,7 +633,7 @@ void load_vocab(const char *filename, Config *config, struct llama_vocab *vocab)
     }
 }
 
-void convert_weights_ak_to_gg(struct ggml_tensor * gg_weights, const float * karpathy_weights) {
+void stuff_karpathy_weights_into_gg(struct ggml_tensor * gg_weights, float * karpathy_weights){
     int ct;
     switch (gg_weights->n_dims){
         case 1:
@@ -674,13 +670,13 @@ void convert_weights_ak_to_gg(struct ggml_tensor * gg_weights, const float * kar
 }
 
 void save_as_llama_model(struct llama_vocab * vocab, struct my_llama_model * model, TransformerWeights* w, const char * filename) {
-    // convert AK weights into GG weights one by one.
+    // stuff AK weights into GG weights one by one.
     // w->token_embedding_table -> model->tok_embeddings
     // float* -> struct ggml_tensor
-    convert_weights_ak_to_gg(model->tok_embeddings, w->token_embedding_table);
-    convert_weights_ak_to_gg(model->output, w->wcls ? w->wcls : w->token_embedding_table);
+    stuff_karpathy_weights_into_gg(model->tok_embeddings, w->token_embedding_table);
+    stuff_karpathy_weights_into_gg(model->output, w->wcls ? w->wcls : w->token_embedding_table);
 
-    convert_weights_ak_to_gg(model->norm, w->rms_final_weight);
+    stuff_karpathy_weights_into_gg(model->norm, w->rms_final_weight);
     //print_row(model->norm, 0);
 
     // for rms-att-weight
@@ -690,18 +686,18 @@ void save_as_llama_model(struct llama_vocab * vocab, struct my_llama_model * mod
     for (uint32_t i = 0; i < model->hparams.n_layer; ++i){
         auto & layer = model->layers[i];
         // 1d
-        convert_weights_ak_to_gg(layer.attention_norm, &w->rms_att_weight[i*row_length]);
-        convert_weights_ak_to_gg(layer.ffn_norm , &w->rms_ffn_weight[i*row_length]);
+        stuff_karpathy_weights_into_gg(layer.attention_norm, &w->rms_att_weight[i*row_length]);
+        stuff_karpathy_weights_into_gg(layer.ffn_norm , &w->rms_ffn_weight[i*row_length]);
 
         // from 3d matrix layer x dim x dim to 2d matrix dim x dim
-        convert_weights_ak_to_gg(layer.wq , &w->wq[i*row_length*row_length]);
-        convert_weights_ak_to_gg(layer.wk , &w->wk[i*row_length*row_length]);
-        convert_weights_ak_to_gg(layer.wv , &w->wv[i*row_length*row_length]);
-        convert_weights_ak_to_gg(layer.wo , &w->wo[i*row_length*row_length]);
-
-        convert_weights_ak_to_gg(layer.w1 , &w->w1[i*row_length*n_ff]);
-        convert_weights_ak_to_gg(layer.w2 , &w->w2[i*n_ff*row_length]);
-        convert_weights_ak_to_gg(layer.w3 , &w->w3[i*row_length*n_ff]);
+        stuff_karpathy_weights_into_gg(layer.wq , &w->wq[i*row_length*row_length]);
+        stuff_karpathy_weights_into_gg(layer.wk , &w->wk[i*row_length*row_length]);
+        stuff_karpathy_weights_into_gg(layer.wv , &w->wv[i*row_length*row_length]);
+        stuff_karpathy_weights_into_gg(layer.wo , &w->wo[i*row_length*row_length]);
+
+        stuff_karpathy_weights_into_gg(layer.w1 , &w->w1[i*row_length*n_ff]);
+        stuff_karpathy_weights_into_gg(layer.w2 , &w->w2[i*n_ff*row_length]);
+        stuff_karpathy_weights_into_gg(layer.w3 , &w->w3[i*row_length*n_ff]);
     }
 
     struct gguf_context * ctx = gguf_init_empty();
@@ -902,7 +898,7 @@ bool params_parse(int argc, char ** argv, struct train_params * params) {
 }
 
 std::string basename(const std::string &path) {
-    size_t pos = path.find_last_of("/\\");
+    size_t pos = path.find_last_of("/");
     if (pos == std::string::npos) {
         return path;
     }
@@ -915,7 +911,7 @@ int main(int argc, char ** argv) {
         return 1;
     }
     Config config;
-    TransformerWeights weights = {};
+    TransformerWeights weights;
     {
         FILE *file = fopen(params.fn_llama2c_model, "rb");
         if (!file) { printf("Unable to open the checkpoint file %s!\n", params.fn_llama2c_model); return 1; }
@@ -957,5 +953,6 @@
     printf("Saving llama.c model file %s in ggml format at %s\n", params.fn_llama2c_model, params.fn_llama2c_output_model);
 
     ggml_free(model.ctx);
+    free_weights(&weights);
     return 0;
 }
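
One behavioral consequence of the basename() change above: find_last_of("/\\") splits on either separator, while find_last_of("/") leaves backslash-separated Windows paths whole. A small illustration (the path is a made-up example):

    #include <iostream>
    #include <string>

    static std::string basename_fwd(const std::string & path) {
        size_t pos = path.find_last_of("/");   // forward slashes only
        return pos == std::string::npos ? path : path.substr(pos + 1);
    }

    static std::string basename_any(const std::string & path) {
        size_t pos = path.find_last_of("/\\"); // forward or back slashes
        return pos == std::string::npos ? path : path.substr(pos + 1);
    }

    int main() {
        const std::string p = "models\\llama2c\\model.bin"; // hypothetical Windows path
        std::cout << basename_fwd(p) << "\n"; // prints the full path unchanged
        std::cout << basename_any(p) << "\n"; // prints "model.bin"
        return 0;
    }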
