Add per token attrib enum

jaime-m-p · jaime-m-p · commit cec6a3bde95a · 2024-06-01T19:42:21.000+02:00
diff --git a/llama.cpp b/llama.cpp
@@ -2147,14 +2147,16 @@ struct llama_control_vector {
 };
 
 struct llama_vocab {
-    using id    = int32_t;
-    using token = std::string;
-    using ttype = llama_token_type;
+    using id      = int32_t;
+    using token   = std::string;
+    using ttype   = llama_token_type;
+    using tattrib = llama_token_attrib;
 
     struct token_data {
-        token text;
-        float score;
-        ttype type;
+        token   text;    // std::string (see the `token` alias)
+        float   score;
+        ttype   type;    // llama_token_type value for this token
+        tattrib attribs; // llama_token_attrib bit flags; meant to be derived from `type` in llm_load_vocab
     };
 
     enum llama_vocab_type     type     = LLAMA_VOCAB_TYPE_SPM;
@@ -4865,6 +4867,24 @@ static void llm_load_vocab(
 
         LLAMA_LOG_INFO("%s: token to piece cache size = %.4f MB\n", __func__, size_cache / 1024.0 / 1024.0);
     }
+
+    // Handle per token attributes
+    //NOTE: Each model customizes per token attributes.
+    //NOTE: Per token attributes are missing from the GGUF file.
+    //TODO: Merge llama_token_type and llama_token_attrib.
+    {
+        // Derive the attribute bit from each token's type; bind by reference so the write lands in vocab.id_to_token, not in a copy.
+        for (auto & data : vocab.id_to_token) {
+            uint32_t attrib = LLAMA_TOKEN_ATTRIB_UNDEFINED; // each term below is the flag when the type matches, otherwise 0
+            attrib |= LLAMA_TOKEN_ATTRIB_UNKNOWN      * (data.type == LLAMA_TOKEN_TYPE_UNKNOWN);
+            attrib |= LLAMA_TOKEN_ATTRIB_UNUSED       * (data.type == LLAMA_TOKEN_TYPE_UNUSED);
+            attrib |= LLAMA_TOKEN_ATTRIB_NORMAL       * (data.type == LLAMA_TOKEN_TYPE_NORMAL);
+            attrib |= LLAMA_TOKEN_ATTRIB_CONTROL      * (data.type == LLAMA_TOKEN_TYPE_CONTROL);
+            attrib |= LLAMA_TOKEN_ATTRIB_USER_DEFINED * (data.type == LLAMA_TOKEN_TYPE_USER_DEFINED);
+            attrib |= LLAMA_TOKEN_ATTRIB_BYTE         * (data.type == LLAMA_TOKEN_TYPE_BYTE);
+            data.attribs = (llama_token_attrib) attrib;
+        }
+    }
 }
 
 static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
diff --git a/llama.h b/llama.h
@@ -107,6 +107,20 @@ extern "C" {
         LLAMA_TOKEN_TYPE_BYTE         = 6,
     };
 
+    enum llama_token_attrib { // per-token attribute bit flags; distinct bits so values can be OR-ed together
+        LLAMA_TOKEN_ATTRIB_UNDEFINED    = 0,        // no attribute bits set
+        LLAMA_TOKEN_ATTRIB_UNKNOWN      = 1 <<  1,  // flags start at bit 1; bit 0 (value 1) is not assigned
+        LLAMA_TOKEN_ATTRIB_UNUSED       = 1 <<  2,  // UNKNOWN..BYTE correspond to the same-named LLAMA_TOKEN_TYPE_* values
+        LLAMA_TOKEN_ATTRIB_NORMAL       = 1 <<  3,
+        LLAMA_TOKEN_ATTRIB_CONTROL      = 1 <<  4,  // SPECIAL? (open question: should this be named SPECIAL instead)
+        LLAMA_TOKEN_ATTRIB_USER_DEFINED = 1 <<  5,
+        LLAMA_TOKEN_ATTRIB_BYTE         = 1 <<  6,
+        LLAMA_TOKEN_ATTRIB_NORMALIZED   = 1 <<  7,  // NOTE(review): NORMALIZED..SINGLE_WORD have no LLAMA_TOKEN_TYPE_* counterpart
+        LLAMA_TOKEN_ATTRIB_LSTRIP       = 1 <<  8,  // presumably strip whitespace left of the token - TODO confirm
+        LLAMA_TOKEN_ATTRIB_RSTRIP       = 1 <<  9,  // presumably strip whitespace right of the token - TODO confirm
+        LLAMA_TOKEN_ATTRIB_SINGLE_WORD  = 1 << 10,  // presumably match only as a whole word - TODO confirm
+    };
+
     // model file types
     enum llama_ftype {
         LLAMA_FTYPE_ALL_F32              = 0,