@@ -5,12 +5,13 @@
 #include <cmath>
 #include <cstdint>
 #include <cstdio>
+#include <fstream>
 #include <string>
 #include <vector>
 #include <unordered_map>
 
 // Data structures to map n-grams to empirical token probabilities:
-typedef std::unordered_map<llama_token, int>        token_hashmap; // token -> number of times token has been seen
+typedef std::unordered_map<llama_token, int32_t>    token_hashmap; // token -> number of times token has been seen
 typedef std::unordered_map<uint64_t, token_hashmap> all_token_hashmap; // n-gram -> empirical distribution of following tokens
 // n-grams are encoded as 64 bit integers with each of the 4 16 bit sections representing a token id.
 // This way no custom hashing function for the n-grams is needed.
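For readers who want to see the encoding concretely, here is a minimal standalone sketch (not part of the patch; `encode_ngram` and the token values are illustrative). It assumes token ids fit into 16 bits, which is the same assumption the `static_assert` below relies on:

```cpp
#include <cstdint>
#include <cstdio>

typedef int32_t llama_token; // stand-in for the llama.cpp typedef

// Pack an n-gram (n <= 4) into a uint64_t, 16 bits per token id.
static uint64_t encode_ngram(const llama_token * tokens, int ngram_size) {
    uint64_t ngram = tokens[0];
    for (int j = 1; j < ngram_size; ++j) {
        ngram <<= 16;
        ngram |= tokens[j];
    }
    return ngram;
}

int main() {
    const llama_token tokens[2] = {15043, 2787}; // two arbitrary token ids
    // 15043 = 0x3ac3, 2787 = 0x0ae3 -> key = 0x000000003ac30ae3
    printf("key = 0x%016llx\n", (unsigned long long) encode_ngram(tokens, 2));
}
```

Because the packed integer identifies the n-gram uniquely, `std::unordered_map`'s default `std::hash<uint64_t>` suffices and no custom hash functor is needed.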
@@ -22,7 +23,7 @@ static_assert(ngram_max <= sizeof(uint64_t)/2, "A 64 bit integer can only hold i
 
 // If sample size or percentage in context are below these thresholds the draft is aborted early:
 constexpr float draft_min_sample_size[ngram_max] = { 2,  2,  1,  1};
-constexpr float draft_min_percent[ngram_max]     = {66, 50, 50, 50};
+constexpr float draft_min_percent[ngram_max]     = {50, 50, 50, 50};
 
 int main(int argc, char ** argv){
     gpt_params params;
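The arrays are indexed by `ngram_size-1`, so this hunk relaxes only the 1-gram acceptance bar from 66% to 50%. A small sketch (function name hypothetical) of the early-abort check these constants feed into, with a worked example:

```cpp
// Illustrative only: mirrors the abort conditions used further down in the file.
static bool draft_candidate_ok(int ngram_size, int sum_count, int max_count) {
    if (sum_count < draft_min_sample_size[ngram_size-1]) {
        return false; // sample size too low
    }
    if (100*max_count < draft_min_percent[ngram_size-1]*sum_count) {
        return false; // most frequent continuation not dominant enough
    }
    return true;
}

// Example: a 1-gram whose best continuation was seen 6 times out of 10 (60%).
// Old threshold: 100*6 < 66*10 -> rejected. New threshold: 100*6 >= 50*10 -> accepted.
```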
@@ -100,12 +101,43 @@ int main(int argc, char ** argv){
     };
 
     all_token_hashmap all_token_counts[ngram_max-ngram_min+1];
+    all_token_hashmap static_all_token_counts;
     int64_t t_draft_us = 0;
 
     {
        // Fill up hashmaps with tokens from user input:
        const int64_t t_start_draft_us = ggml_time_us();
        update_hashmaps(all_token_counts, inp.data(), inp.size(), inp.size());
+
+       const char * hashmap_file_name = "lookup.bin";
+       std::ifstream hashmap_file(hashmap_file_name, std::ios::binary);
+       if (!hashmap_file) {
+           fprintf(stderr, "error: failed to open file '%s'\n", hashmap_file_name);
+           exit(1);
+       }
+       uint64_t    ngram;
+       int32_t     ntokens;
+       llama_token token;
+       int32_t     count;
+
+       char * ngramc   = reinterpret_cast<char *>(&ngram);
+       char * ntokensc = reinterpret_cast<char *>(&ntokens);
+       char * tokenc   = reinterpret_cast<char *>(&token);
+       char * countc   = reinterpret_cast<char *>(&count);
+       while (hashmap_file.read(ngramc, sizeof(uint64_t))) {
+           GGML_ASSERT(hashmap_file.read(ntokensc, sizeof(int32_t)));
+           token_hashmap token_counts;
+
+           for (int i = 0; i < ntokens; ++i) {
+               GGML_ASSERT(hashmap_file.read(tokenc, sizeof(llama_token)));
+               GGML_ASSERT(hashmap_file.read(countc, sizeof(int32_t)));
+               token_counts.emplace(token, count);
+           }
+
+           static_all_token_counts.emplace(ngram, token_counts);
+       }
+       GGML_ASSERT(hashmap_file.eof());
+
        t_draft_us += ggml_time_us() - t_start_draft_us;
     }
 
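The read loop implies a simple record format for `lookup.bin`: per n-gram, a `uint64_t` key, an `int32_t` entry count, then that many `(llama_token, int32_t)` pairs, all in native byte order. The patch only reads this file; a hypothetical writer producing the same layout (e.g. for a separate corpus-preprocessing tool) might look like this:

```cpp
// Hypothetical counterpart to the reader above; raw memory dumps, so the file
// is only portable between machines of the same endianness.
static void save_hashmap(const all_token_hashmap & atc, const char * file_name) {
    std::ofstream hashmap_file(file_name, std::ios::binary);
    for (const std::pair<const uint64_t, token_hashmap> & entry : atc) {
        const uint64_t ngram   = entry.first;
        const int32_t  ntokens = (int32_t) entry.second.size();
        hashmap_file.write(reinterpret_cast<const char *>(&ngram),   sizeof(uint64_t));
        hashmap_file.write(reinterpret_cast<const char *>(&ntokens), sizeof(int32_t));
        for (const std::pair<const llama_token, int32_t> & tc : entry.second) {
            const llama_token token = tc.first;
            const int32_t     count = tc.second;
            hashmap_file.write(reinterpret_cast<const char *>(&token), sizeof(llama_token));
            hashmap_file.write(reinterpret_cast<const char *>(&count), sizeof(int32_t));
        }
    }
}
```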
@@ -248,6 +280,20 @@ int main(int argc, char ** argv){
 
         while ((int) draft.size()-1 < n_draft) {
             bool draft_success = false;
+
+            const int static_ngram_start = inp_size-2 + draft.size()-1;
+            uint64_t static_ngram = get_token(inp, draft, static_ngram_start);
+            for (int j = static_ngram_start+1; j < static_ngram_start + 2; ++j) {
+                const uint64_t ngram_part = get_token(inp, draft, j);
+                static_ngram <<= 16;
+                static_ngram |= ngram_part;
+            }
+            all_token_hashmap::iterator static_token_counts_it = static_all_token_counts.find(static_ngram);
+            token_hashmap static_token_counts;
+            if (static_token_counts_it != static_all_token_counts.end()) {
+                static_token_counts = static_token_counts_it->second;
+            }
+
             for (int ngram_size = ngram_max; ngram_size >= ngram_min; --ngram_size) {
                 if (ngram_size > inp_size) {
                     continue;
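The hard-coded `2` means the static table is always queried with a 2-gram: the last two tokens of the virtual sequence formed by the context followed by the draft so far. A sketch of the indexing convention, assuming `get_token` (defined earlier in this file) reads from that concatenation:

```cpp
// Illustrative restatement of the indexing assumed above: position i addresses
// inp for i < inp.size() and the draft (offset by one) afterwards.
static llama_token get_token_sketch(const std::vector<llama_token> & inp,
                                    const std::vector<llama_token> & draft,
                                    size_t i) {
    return i < inp.size() ? inp[i] : draft[1 + i - inp.size()];
}

// With inp_size = 10 and draft.size() = 1, static_ngram_start = 10-2 + 1-1 = 8,
// so the 2-gram key covers positions 8 and 9: the last two context tokens.
```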
@@ -270,16 +316,21 @@ int main(int argc, char ** argv){
                 const token_hashmap token_counts = token_counts_it->second;
 
                 int max_count        = 0;
+                int max_count_static = 0;
                 int sum_count        = 0;
                 llama_token max_token = -1;
 
                 for (std::pair<llama_token, int> tc : token_counts) {
                     const llama_token token = tc.first;
-                    const llama_token count = tc.second;
 
-                    if (count > max_count) {
-                        max_token = token;
-                        max_count = count;
+                    token_hashmap::iterator stc_it = static_token_counts.find(token);
+                    const int32_t count        = tc.second;
+                    const int32_t count_static = stc_it != static_token_counts.end() ? 100*stc_it->second : 1;
+
+                    if (count*count_static > max_count*max_count_static) {
+                        max_token        = token;
+                        max_count        = count;
+                        max_count_static = count_static;
                     }
                     sum_count += count;
                 }
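The combined score multiplies the in-context count by `100*static_count` when the static table corroborates a token and by `1` otherwise, so corpus statistics dominate without being able to veto a context-only candidate. A worked example (helper name hypothetical):

```cpp
#include <cstdio>

// Mirrors the comparison above; long avoids overflow in the sketch.
static long combined_score(int count, int static_count) {
    const int count_static = static_count > 0 ? 100*static_count : 1;
    return (long) count * count_static;
}

int main() {
    printf("token A: %ld\n", combined_score(3, 7)); // 3 * 700 = 2100
    printf("token B: %ld\n", combined_score(5, 0)); // 5 *   1 =    5
    // A is drafted despite fewer in-context observations; if no token has
    // static support, every weight is 1 and the old count comparison remains.
}
```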
@@ -299,6 +350,38 @@ int main(int argc, char ** argv){
                 break;
             }
 
+            if (!draft_success) {
+                int max_count = 0;
+                int sum_count = 0;
+                llama_token max_token = -1;
+
+                for (std::pair<llama_token, int> tc : static_token_counts) {
+                    const llama_token token = tc.first;
+                    const int32_t     count = tc.second;
+
+                    if (count > max_count) {
+                        max_token = token;
+                        max_count = count;
+                    }
+                    sum_count += count;
+                }
+
+                // Skip this candidate if the sample size is too low:
+                if (sum_count < draft_min_sample_size[2-1]) {
+                    break;
+                }
+                // Skip this candidate if the empirically most likely token following this n-gram is not likely enough:
+                if (100*max_count < draft_min_percent[2-1]*sum_count) {
+                    break;
+                }
+
+                LOG(" - draft candidate: token=%d count=%d\n", max_token, max_count);
+                llama_batch_add(batch_tgt, max_token, n_past + draft.size(), { 0 }, true);
+                draft.push_back(max_token);
+                draft_success = true;
+                break;
+            }
+
             if (!draft_success) {
                 break;
             }