@@ -21,7 +21,7 @@ class LibLlama:
     DEFAULT_PATH_LLAMA_H = "./llama.h"
     DEFAULT_PATH_LIBLLAMA = "./build/libllama.so"  # CMakeLists.txt: BUILD_SHARED_LIBS ON
 
-    def __init__(self, path_llama_h:str=None, path_libllama:str=None):
+    def __init__(self, path_llama_h: str = None, path_libllama: str = None):
        path_llama_h = path_llama_h or self.DEFAULT_PATH_LLAMA_H
        path_libllama = path_libllama or self.DEFAULT_PATH_LIBLLAMA
        (self.ffi, self.lib) = self._load_libllama_cffi(path_llama_h, path_libllama)
@@ -42,34 +42,35 @@ def _load_libllama_cffi(self, path_llama_h: str, path_libllama: str):
         ffi.cdef(source, override=True)
         lib = ffi.dlopen(path_libllama)
         return (ffi, lib)
-
+
     def model_default_params(self, **kwargs):
         mparams = self.lib.llama_model_default_params()
         for k, v in kwargs.items():
             setattr(mparams, k, v)
         return mparams
-
+
     def context_default_params(self, **kwargs):
         cparams = self.lib.llama_context_default_params()
         for k, v in kwargs.items():
             setattr(cparams, k, v)
         return cparams
 
+
 class LibLlamaModel:
 
-    def __init__(self, libllama:LibLlama, path_model:str, mparams={}, cparams={}):
+    def __init__(self, libllama: LibLlama, path_model: str, mparams={}, cparams={}):
         self.lib = libllama.lib
         self.ffi = libllama.ffi
         if type(mparams) == dict:
             mparams = libllama.model_default_params(**mparams)
         self.model = self.lib.llama_load_model_from_file(path_model.encode(), mparams)
         if not self.model:
-            raise RuntimeError("error: failed to load model '%s'"%path_model)
+            raise RuntimeError("error: failed to load model '%s'" % path_model)
         if type(cparams) == dict:
             cparams = libllama.context_default_params(**cparams)
         self.ctx = self.lib.llama_new_context_with_model(self.model, cparams)
         if not self.ctx:
-            raise RuntimeError("error: failed to create context for model '%s'"%path_model)
+            raise RuntimeError("error: failed to create context for model '%s'" % path_model)
         n_tokens_max = self.lib.llama_n_ctx(self.ctx)
         self.token_ids = self.ffi.new("llama_token[]", n_tokens_max)
 
@@ -82,7 +83,7 @@ def free(self):
         self.model = None
         self.lib = None
 
-    def tokenize(self, text:str, n_tokens_max:int=0, add_special:bool=False, parse_special:bool=False) -> list[int]:
+    def tokenize(self, text: str, n_tokens_max: int = 0, add_special: bool = False, parse_special: bool = False) -> list[int]:
         n_tokens_max = n_tokens_max if n_tokens_max > 0 else len(self.token_ids)
         text = text.encode("utf-8")
         num = self.lib.llama_tokenize(self.model, text, len(text), self.token_ids, n_tokens_max, add_special, parse_special)
@@ -91,14 +92,14 @@ def tokenize(self, text:str, n_tokens_max:int=0, add_special:bool=False, parse_s
         return list(self.token_ids[0:num])
 
 
-def find_first_mismatch(ids1:list[int], ids2:list[int]):
+def find_first_mismatch(ids1: list[int], ids2: list[int]):
     for i, (a, b) in enumerate(zip(ids1, ids2)):
         if a != b:
             return i
     return -1 if len(ids1) == len(ids2) else i
 
 
-def test_custom_texts(model:LibLlamaModel, tokenizer:PreTrainedTokenizerBase):
+def test_custom_texts(model: LibLlamaModel, tokenizer: PreTrainedTokenizerBase):
 
     tests = [
         "",
@@ -153,7 +154,7 @@ def test_custom_texts(model:LibLlamaModel, tokenizer:PreTrainedTokenizerBase):
         '\uFEFF//',  # unicode_ranges_control, 0xFEFF (BOM)
     ]
 
-    for text in tests+more_tests:
+    for text in tests + more_tests:
         ids1 = model.tokenize(text, parse_special=True)
         ids2 = tokenizer.encode(text)
         logger.info(repr(text))
@@ -164,22 +165,22 @@ def test_custom_texts(model:LibLlamaModel, tokenizer:PreTrainedTokenizerBase):
             raise Exception()
 
 
-def test_random_chars(model:LibLlamaModel, tokenizer:PreTrainedTokenizerBase, iterations=100):
+def test_random_chars(model: LibLlamaModel, tokenizer: PreTrainedTokenizerBase, iterations=100):
 
-    WHITESPACES = list(" "*20 + "\n"*5 + "\r\n"*5 + "\t"*5)
+    WHITESPACES = list(" " * 20 + "\n" * 5 + "\r\n" * 5 + "\t" * 5)
     CHARS = list(set("""
         ABCDEFGHIJKLMNOPQRSTUVWXYZ
         abcdefghijklmnopqrstuvwxyz
         ÁÉÍÓÚÀÈÌÒÙÂÊÎÔÛÄËÏÖÜ
         áéíóúàèìòùâêîôûäëïöü
         .-,*/-+ª!"·$%&/()=?¿[]{}<>\\|@#~½¬~;:_
     """))
-
+
     logger.info("Bruteforce random chars encodings ...")
     rand = random.Random()
     for m in range(iterations):
 
-        logger.debug("%d/%d" % (m+1, iterations))
+        logger.debug("%d/%d" % (m + 1, iterations))
         rand.seed(m)
 
         text = []
@@ -188,29 +189,29 @@ def test_random_chars(model:LibLlamaModel, tokenizer:PreTrainedTokenizerBase, it
             k = rand.randint(1, 7)
             word = rand.choices(CHARS, k=k)
             space = rand.choice(WHITESPACES)
-            text.append("".join(word)+space)
+            text.append("".join(word) + space)
         text = "".join(text)
 
         ids1 = model.tokenize(text, parse_special=True)
         ids2 = tokenizer.encode(text)
         assert(ids1 == ids2)
 
 
-def test_random_vocab_chars(model:LibLlamaModel, tokenizer:PreTrainedTokenizerBase, iterations=100):
+def test_random_vocab_chars(model: LibLlamaModel, tokenizer: PreTrainedTokenizerBase, iterations=100):
 
     logger.info("Building vocab char list ...")
     vocab_ids = list(tokenizer.vocab.values())
     vocab_text = tokenizer.decode(vocab_ids)
     vocab_chars = list(set(vocab_text))
     del vocab_ids, vocab_text
-
+
     logger.info("Bruteforce random text encodings ...")
     rand = random.Random()
     for m in range(iterations):
 
-        logger.debug("%d/%d" % (m+1, iterations))
+        logger.debug("%d/%d" % (m + 1, iterations))
         rand.seed(m)
-
+
         text = rand.choices(vocab_chars, k=1024)
         text = "".join(text)
 
@@ -219,7 +220,7 @@ def test_random_vocab_chars(model:LibLlamaModel, tokenizer:PreTrainedTokenizerBa
         assert(ids1 == ids2)
 
 
-def test_random_vocab_tokens(model:LibLlamaModel, tokenizer:PreTrainedTokenizerBase, iterations=100):
+def test_random_vocab_tokens(model: LibLlamaModel, tokenizer: PreTrainedTokenizerBase, iterations=100):
 
     logger.info("Building token list ...")
     space_id = tokenizer.encode(" ")[0]
@@ -230,7 +231,7 @@ def test_random_vocab_tokens(model:LibLlamaModel, tokenizer:PreTrainedTokenizerB
     vocab_tokens = tokenizer.decode(vocab_ids)
     vocab_tokens = vocab_tokens.split(" ")
     del vocab_ids
-
+
     logger.info("Checking single token encodings ...")
     for token in vocab_tokens:
         ids1 = model.tokenize(token, parse_special=True)
@@ -241,15 +242,15 @@ def test_random_vocab_tokens(model:LibLlamaModel, tokenizer:PreTrainedTokenizerB
     rand = random.Random()
     for m in range(iterations):
 
-        logger.debug("%d/%d" % (m+1, iterations))
+        logger.debug("%d/%d" % (m + 1, iterations))
         rand.seed(m)
-
+
         text = []
         num_words = rand.randint(300, 400)
         for i in range(num_words):
             k = rand.randint(1, 3)
             tokens = rand.choices(vocab_tokens, k=k)
-            tokens = [ t.strip(" \n\r\t") for t in tokens ]
+            tokens = [t.strip(" \n\r\t") for t in tokens]
             sep = rand.choice(" \n\r\t")
             text.append("".join(tokens) + sep)
         text = "".join(text)
@@ -259,15 +260,15 @@ def test_random_vocab_tokens(model:LibLlamaModel, tokenizer:PreTrainedTokenizerB
         assert(ids1 == ids2)
 
 
-def test_random_bytes(model:LibLlamaModel, tokenizer:PreTrainedTokenizerBase, iterations=100):
+def test_random_bytes(model: LibLlamaModel, tokenizer: PreTrainedTokenizerBase, iterations=100):
 
-    WHITESPACES = list(" "*20 + "\n"*5 + "\r\n"*5 + "\t"*5)
+    WHITESPACES = list(" " * 20 + "\n" * 5 + "\r\n" * 5 + "\t" * 5)
 
     logger.info("Bruteforce random bytes encodings ...")
     rand = random.Random()
     for m in range(iterations):
 
-        logger.debug("%d/%d" % (m+1, iterations))
+        logger.debug("%d/%d" % (m + 1, iterations))
         rand.seed(m)
 
         text = []
@@ -302,6 +303,6 @@ def test_random_bytes(model, tokenizer:PreTrainedTokenizerBase, it
 test_random_chars(model, tokenizer, 10_000)
 test_random_vocab_chars(model, tokenizer, 10_000)
 test_random_vocab_tokens(model, tokenizer, 10_000)
-#test_random_bytes(model, tokenizer, 10_000) # FAIL
+# test_random_bytes(model, tokenizer, 10_000) # FAIL
 
 model.free()