
Commit bc7a3c0

ngram: uint64_t -> struct with 4x int32_t

Parent: 1de9b5d

2 files changed: +46, −20 lines

common/common.cpp

7 additions, 16 deletions
@@ -1879,12 +1879,7 @@ void llama_ngram_cache_update(llama_ngram_cache & ngram_cache, int ngram_min, in
         const int i_start = std::max(inp_size - nnew, ngram_size);
         for (int i = i_start; i < inp_size; ++i) {
             const int ngram_start = i - ngram_size;
-            llama_ngram ngram = inp[ngram_start];
-            for (int j = ngram_start+1; j < ngram_start + ngram_size; ++j) { // FIXME
-                const llama_ngram ngram_part = inp[j];
-                ngram <<= 16;
-                ngram |= ngram_part;
-            }
+            llama_ngram ngram(&inp[ngram_start], ngram_size);
             const llama_token token = inp[i];
 
             llama_ngram_cache::iterator part_it = ngram_cache.find(ngram);
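The deleted loop is what the FIXME referred to: it packed each token into a 16-bit slice of a uint64_t key, but llama_token is a 32-bit id, so any id at or above 2^16 either spilled into the neighboring slice or collided with a smaller id. A minimal standalone sketch of the collision, not part of the commit, with the lossy truncation made explicit via a uint16_t cast (the original OR-ed the full 32-bit value instead):

```cpp
#include <cstdint>
#include <cstdio>

typedef int32_t llama_token; // token ids are 32 bit

int main() {
    // Two different 4-grams whose token ids agree modulo 2^16:
    const llama_token a_tok[4] = {70000, 3, 70001, 3};
    const llama_token b_tok[4] = { 4464, 3,  4465, 3}; // 70000 - 65536 == 4464

    uint64_t a = 0;
    uint64_t b = 0;
    for (int j = 0; j < 4; ++j) {
        a = (a << 16) | (uint16_t) a_tok[j]; // keep only the low 16 bits
        b = (b << 16) | (uint16_t) b_tok[j];
    }
    printf("%d\n", a == b); // prints 1: distinct n-grams collapse onto one key
}
```

The new llama_ngram constructor avoids this entirely by storing every token id in full.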
@@ -2019,11 +2014,9 @@ void llama_ngram_cache_draft(
         llama_token drafted_token = -1;
 
         const int ngram_start_static = inp_size-LLAMA_NGRAM_STATIC + draft.size()-1;
-        llama_ngram ngram_static = get_token(inp, draft, ngram_start_static);
-        for (int j = ngram_start_static+1; j < ngram_start_static + LLAMA_NGRAM_STATIC; ++j) {
-            const llama_ngram token = get_token(inp, draft, j);
-            ngram_static <<= 16;
-            ngram_static |= token;
+        llama_ngram ngram_static;
+        for (int j = ngram_start_static; j < ngram_start_static + LLAMA_NGRAM_STATIC; ++j) {
+            ngram_static.tokens[j-ngram_start_static] = get_token(inp, draft, j);
         }
         llama_ngram_cache::iterator part_static_it = nc_static.find(ngram_static);
         llama_ngram_cache_part part_static;
@@ -2035,11 +2028,9 @@
         std::vector<llama_ngram> ngrams_cd;
         for (int ngram_size_cd = ngram_min; ngram_size_cd <= ngram_max; ++ngram_size_cd) {
             const int ngram_start_cd = inp_size-ngram_size_cd + draft.size()-1;
-            llama_ngram ngram_cd = get_token(inp, draft, ngram_start_cd);
-            for (int j = ngram_start_cd+1; j < ngram_start_cd + ngram_size_cd; ++j) {
-                const llama_ngram token = get_token(inp, draft, j);
-                ngram_cd <<= 16;
-                ngram_cd |= token;
+            llama_ngram ngram_cd;
+            for (int j = ngram_start_cd; j < ngram_start_cd + ngram_size_cd; ++j) {
+                ngram_cd.tokens[j-ngram_start_cd] = get_token(inp, draft, j);
             }
             ngrams_cd.push_back(ngram_cd);
         }
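Both draft-side hunks follow the same pattern: the loop now starts at the first token of the n-gram and writes each id into its own tokens[] slot, and the default constructor's zero padding fills the unused slots so shorter n-grams still hash and compare consistently. A hedged sketch of that pattern follows; get_token is a reconstruction of its apparent contract at these call sites (index j addresses the concatenation of context and draft), and the concrete token values are illustrative:

```cpp
#include <cstdint>
#include <cstring>
#include <vector>

typedef int32_t llama_token;
#define LLAMA_NGRAM_MAX 4

struct llama_ngram {
    llama_token tokens[LLAMA_NGRAM_MAX];
    llama_ngram() { memset(tokens, 0, sizeof(tokens)); }
};

// Assumed behavior of the helper used by llama_ngram_cache_draft:
// read index i from the context if possible, otherwise from the draft.
static llama_token get_token(const std::vector<llama_token> & inp,
                             const std::vector<llama_token> & draft, int i) {
    return i < (int) inp.size() ? inp[i] : draft[1 + i - (int) inp.size()];
}

int main() {
    const std::vector<llama_token> inp   = {10, 11, 12, 13}; // context
    const std::vector<llama_token> draft = {13, 14};         // draft[0] mirrors the last context token

    const int ngram_size  = 2; // plays the role of LLAMA_NGRAM_STATIC or ngram_size_cd
    const int ngram_start = (int) inp.size() - ngram_size + (int) draft.size() - 1;

    llama_ngram ngram; // unused slots stay 0 thanks to the default constructor
    for (int j = ngram_start; j < ngram_start + ngram_size; ++j) {
        ngram.tokens[j - ngram_start] = get_token(inp, draft, j);
    }
    // ngram.tokens is now {13, 14, 0, 0}: the two most recent tokens, zero-padded
    return 0;
}
```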

common/common.h

39 additions, 4 deletions
@@ -268,11 +268,46 @@ void dump_kv_cache_view_seqs(const llama_kv_cache_view & view, int row_size = 40
 #define LLAMA_NGRAM_STATIC 2
 
 // Data structures to map n-grams to empirical token probabilities:
-typedef uint64_t llama_ngram; // Each of the 4 16 bit sections represents a token id.
-typedef std::unordered_map<llama_token, int32_t> llama_ngram_cache_part; // token -> number of times token has been seen
-typedef std::unordered_map<llama_ngram, llama_ngram_cache_part> llama_ngram_cache; // n-gram -> empirical distribution of following tokens
 
-static_assert(LLAMA_NGRAM_MAX <= sizeof(llama_ngram)/2, "A 64 bit integer can only hold information for 4 16 bit tokens.");
+struct llama_ngram {
+    llama_token tokens[LLAMA_NGRAM_MAX];
+
+    llama_ngram() {
+        memset(tokens, 0, sizeof(tokens));
+    }
+
+    llama_ngram(const llama_token * input, const int ngram_size) {
+        for (int i = 0; i < LLAMA_NGRAM_MAX; ++i) {
+            tokens[i] = i < ngram_size ? input[i] : 0;
+        }
+    }
+
+    bool operator==(const llama_ngram & other) const {
+        for (int i = 0; i < LLAMA_NGRAM_MAX; ++i) {
+            if (tokens[i] != other.tokens[i]) {
+                return false;
+            }
+        }
+        return true;
+    }
+};
+
+struct llama_ngram_hash_function {
+    size_t operator()(const llama_ngram & ngram) const {
+        size_t hash = 0;
+        for (int i = 0; i < LLAMA_NGRAM_MAX; ++i) {
+            hash ^= std::hash<llama_token>{}(ngram.tokens[i]);
+        }
+        return hash;
+    }
+};
+
+// token -> number of times token has been seen
+typedef std::unordered_map<llama_token, int32_t> llama_ngram_cache_part;
+
+// n-gram -> empirical distribution of following tokens
+typedef std::unordered_map<llama_ngram, llama_ngram_cache_part, llama_ngram_hash_function> llama_ngram_cache;
+
 
 // Update an ngram cache with tokens.
 // ngram_cache: the cache to modify.
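With value equality on the struct and the companion hash functor, std::unordered_map can use llama_ngram as a key directly, and token ids above 65535 now round-trip intact. A small self-contained usage sketch, not from the commit (the token values and main() harness are illustrative):

```cpp
#include <cstdint>
#include <cstdio>
#include <cstring>
#include <functional>
#include <unordered_map>

typedef int32_t llama_token;
#define LLAMA_NGRAM_MAX 4

// Key type and hash functor as introduced by this commit:
struct llama_ngram {
    llama_token tokens[LLAMA_NGRAM_MAX];
    llama_ngram() { memset(tokens, 0, sizeof(tokens)); }
    llama_ngram(const llama_token * input, const int ngram_size) {
        for (int i = 0; i < LLAMA_NGRAM_MAX; ++i) {
            tokens[i] = i < ngram_size ? input[i] : 0;
        }
    }
    bool operator==(const llama_ngram & other) const {
        for (int i = 0; i < LLAMA_NGRAM_MAX; ++i) {
            if (tokens[i] != other.tokens[i]) return false;
        }
        return true;
    }
};

struct llama_ngram_hash_function {
    size_t operator()(const llama_ngram & ngram) const {
        size_t hash = 0;
        for (int i = 0; i < LLAMA_NGRAM_MAX; ++i) {
            hash ^= std::hash<llama_token>{}(ngram.tokens[i]);
        }
        return hash;
    }
};

typedef std::unordered_map<llama_token, int32_t> llama_ngram_cache_part;
typedef std::unordered_map<llama_ngram, llama_ngram_cache_part, llama_ngram_hash_function> llama_ngram_cache;

int main() {
    llama_ngram_cache cache;
    const llama_token ctx[2] = {70000, 3}; // an id above 65535 would have broken the old packing
    cache[llama_ngram(ctx, 2)][42]++;      // count token 42 as a continuation of this 2-gram
    printf("%d\n", cache[llama_ngram(ctx, 2)][42]); // prints 1
}
```

One property worth noting: the XOR combiner is order-insensitive, so permutations of the same tokens land in the same bucket; operator== still tells them apart, at the cost of extra collisions.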
