@@ -214,54 +214,55 @@ def __init__(
         model_path: str,
         *,
         # NOTE: These parameters are likely to change in the future.
+        seed: int = llama_cpp.LLAMA_DEFAULT_SEED,
         n_ctx: int = 512,
-        n_parts: int = -1,
+        n_batch: int = 512,
         n_gpu_layers: int = 0,
-        seed: int = 1337,
+        main_gpu: int = 0,
+        tensor_split: Optional[List[float]] = None,
+        rope_freq_base: float = 10000.0,
+        rope_freq_scale: float = 1.0,
+        low_vram: bool = False,
+        mul_mat_q: bool = True,
         f16_kv: bool = True,
         logits_all: bool = False,
         vocab_only: bool = False,
         use_mmap: bool = True,
         use_mlock: bool = False,
         embedding: bool = False,
         n_threads: Optional[int] = None,
-        n_batch: int = 512,
         last_n_tokens_size: int = 64,
         lora_base: Optional[str] = None,
         lora_path: Optional[str] = None,
-        low_vram: bool = False,
-        tensor_split: Optional[List[float]] = None,
-        rope_freq_base: float = 10000.0,
-        rope_freq_scale: float = 1.0,
-        n_gqa: Optional[int] = None,  # (TEMPORARY) must be 8 for llama2 70b
-        rms_norm_eps: Optional[float] = None,  # (TEMPORARY)
-        mul_mat_q: Optional[bool] = None,
         verbose: bool = True,
         **kwargs  # type: ignore
     ):
         """Load a llama.cpp model from `model_path`.
 
         Args:
             model_path: Path to the model.
-            n_ctx: Maximum context size.
-            n_parts: Number of parts to split the model into. If -1, the number of parts is automatically determined.
             seed: Random seed. -1 for random.
+            n_ctx: Maximum context size.
+            n_batch: Maximum number of prompt tokens to batch together when calling llama_eval.
             n_gpu_layers: Number of layers to offload to GPU (-ngl). If -1, all layers are offloaded.
+            main_gpu: Main GPU to use.
+            tensor_split: Optional list of floats to split the model across multiple GPUs. If None, the model is not split.
+            rope_freq_base: Base frequency for rope sampling.
+            rope_freq_scale: Scale factor for rope sampling.
+            low_vram: Use low VRAM mode.
+            mul_mat_q: if true, use experimental mul_mat_q kernels
             f16_kv: Use half-precision for key/value cache.
             logits_all: Return logits for all tokens, not just the last token.
             vocab_only: Only load the vocabulary no weights.
             use_mmap: Use mmap if possible.
             use_mlock: Force the system to keep the model in RAM.
             embedding: Embedding mode only.
             n_threads: Number of threads to use. If None, the number of threads is automatically determined.
-            n_batch: Maximum number of prompt tokens to batch together when calling llama_eval.
             last_n_tokens_size: Maximum number of tokens to keep in the last_n_tokens deque.
             lora_base: Optional path to base model, useful if using a quantized base model and you want to apply LoRA to an f16 model.
             lora_path: Path to a LoRA file to apply to the model.
-            tensor_split: List of floats to split the model across multiple GPUs. If None, the model is not split.
-            rope_freq_base: Base frequency for rope sampling.
-            rope_freq_scale: Scale factor for rope sampling.
             verbose: Print verbose output to stderr.
+            kwargs: Unused keyword arguments (for additional backwards compatibility).
 
         Raises:
             ValueError: If the model path does not exist.
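
For context, a minimal usage sketch against the reworked signature shown in this hunk; the model path and keyword values below are illustrative assumptions, not part of this commit:

from llama_cpp import Llama

# Hypothetical local model file; any ggml model the library supports would do.
llm = Llama(
    model_path="./models/7B/ggml-model.bin",
    seed=1337,             # fixed seed for reproducible sampling; -1 picks a random seed
    n_ctx=2048,            # maximum context size
    n_batch=512,           # prompt tokens batched per llama_eval call
    n_gpu_layers=-1,       # -1 offloads all layers to the GPU
    rope_freq_base=10000.0,
    rope_freq_scale=1.0,
)
output = llm("Q: Name the planets in the solar system. A:", max_tokens=64)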
@@ -274,16 +275,20 @@ def __init__(
         self.model_path = model_path
 
         self.params = llama_cpp.llama_context_default_params()
+        self.params.seed = seed
         self.params.n_ctx = n_ctx
         self.params.n_gpu_layers = 0x7FFFFFFF if n_gpu_layers == -1 else n_gpu_layers  # 0x7FFFFFFF is INT32 max, will be auto set to all layers
-        self.params.seed = seed
+        self.params.main_gpu = main_gpu
+        self.params.rope_freq_base = rope_freq_base
+        self.params.rope_freq_scale = rope_freq_scale
+        self.params.low_vram = low_vram
+        self.params.mul_mat_q = mul_mat_q
         self.params.f16_kv = f16_kv
         self.params.logits_all = logits_all
         self.params.vocab_only = vocab_only
         self.params.use_mmap = use_mmap if lora_path is None else False
         self.params.use_mlock = use_mlock
         self.params.embedding = embedding
-        self.params.low_vram = low_vram
 
         self.tensor_split = tensor_split
         self._p_tensor_split = None
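
The hunk above is the wrapper copying its keyword arguments onto the llama.cpp context params struct; a rough equivalent using the low-level bindings directly, restating only fields that appear in this diff and assuming `llama_context_default_params()` returns a mutable struct as used here:

import llama_cpp

# Start from the library defaults, then override the same fields the wrapper sets.
params = llama_cpp.llama_context_default_params()
params.seed = llama_cpp.LLAMA_DEFAULT_SEED
params.n_ctx = 2048
params.n_gpu_layers = 0x7FFFFFFF  # INT32 max, i.e. "offload all layers"
params.main_gpu = 0
params.rope_freq_base = 10000.0
params.rope_freq_scale = 1.0
params.low_vram = False
params.mul_mat_q = True
params.f16_kv = True
params.embedding = False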
@@ -296,12 +301,6 @@ def __init__(
             )  # keep a reference to the array so it is not gc'd
             self.params.tensor_split = self._c_tensor_split
 
-        self.params.rope_freq_base = rope_freq_base
-        self.params.rope_freq_scale = rope_freq_scale
-
-
-        if mul_mat_q is not None:
-            self.params.mul_mat_q = mul_mat_q
 
         self.last_n_tokens_size = last_n_tokens_size
         self.n_batch = min(n_ctx, n_batch)
@@ -313,10 +312,6 @@ def __init__(
         self.lora_base = lora_base
         self.lora_path = lora_path
 
-        ### DEPRECATED ###
-        self.n_parts = n_parts
-        ### DEPRECATED ###
-
         if not os.path.exists(model_path):
            raise ValueError(f"Model path does not exist: {model_path}")
 
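
Note on the removals above: since `n_parts`, `n_gqa`, and `rms_norm_eps` are no longer named parameters, call sites written against the old signature only keep working through the `**kwargs` catch-all documented in the first hunk (the values are accepted but unused). A hedged sketch, again with a hypothetical model path:

from llama_cpp import Llama

# Old-style call: n_parts is no longer a real parameter, so it is absorbed
# by **kwargs and left unused instead of raising a TypeError.
llm = Llama(
    model_path="./models/7B/ggml-model.bin",
    n_ctx=512,
    n_parts=-1,
)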