Commit 6a20293

Reorder init params to match llama.cpp order

1 parent c8f9b8a
1 file changed: +23 -28

1 file changed

+23
-28
lines changed

llama_cpp/llama.py
@@ -214,54 +214,55 @@ def __init__(
         model_path: str,
         *,
         # NOTE: These parameters are likely to change in the future.
+        seed: int = llama_cpp.LLAMA_DEFAULT_SEED,
         n_ctx: int = 512,
-        n_parts: int = -1,
+        n_batch: int = 512,
         n_gpu_layers: int = 0,
-        seed: int = 1337,
+        main_gpu: int = 0,
+        tensor_split: Optional[List[float]] = None,
+        rope_freq_base: float = 10000.0,
+        rope_freq_scale: float = 1.0,
+        low_vram: bool = False,
+        mul_mat_q: bool = True,
         f16_kv: bool = True,
         logits_all: bool = False,
         vocab_only: bool = False,
         use_mmap: bool = True,
         use_mlock: bool = False,
         embedding: bool = False,
         n_threads: Optional[int] = None,
-        n_batch: int = 512,
         last_n_tokens_size: int = 64,
         lora_base: Optional[str] = None,
         lora_path: Optional[str] = None,
-        low_vram: bool = False,
-        tensor_split: Optional[List[float]] = None,
-        rope_freq_base: float = 10000.0,
-        rope_freq_scale: float = 1.0,
-        n_gqa: Optional[int] = None,  # (TEMPORARY) must be 8 for llama2 70b
-        rms_norm_eps: Optional[float] = None,  # (TEMPORARY)
-        mul_mat_q: Optional[bool] = None,
         verbose: bool = True,
         **kwargs  # type: ignore
     ):
         """Load a llama.cpp model from `model_path`.

         Args:
             model_path: Path to the model.
-            n_ctx: Maximum context size.
-            n_parts: Number of parts to split the model into. If -1, the number of parts is automatically determined.
             seed: Random seed. -1 for random.
+            n_ctx: Maximum context size.
+            n_batch: Maximum number of prompt tokens to batch together when calling llama_eval.
             n_gpu_layers: Number of layers to offload to GPU (-ngl). If -1, all layers are offloaded.
+            main_gpu: Main GPU to use.
+            tensor_split: Optional list of floats to split the model across multiple GPUs. If None, the model is not split.
+            rope_freq_base: Base frequency for rope sampling.
+            rope_freq_scale: Scale factor for rope sampling.
+            low_vram: Use low VRAM mode.
+            mul_mat_q: If True, use experimental mul_mat_q kernels.
             f16_kv: Use half-precision for key/value cache.
             logits_all: Return logits for all tokens, not just the last token.
             vocab_only: Only load the vocabulary, no weights.
             use_mmap: Use mmap if possible.
             use_mlock: Force the system to keep the model in RAM.
             embedding: Embedding mode only.
             n_threads: Number of threads to use. If None, the number of threads is automatically determined.
-            n_batch: Maximum number of prompt tokens to batch together when calling llama_eval.
             last_n_tokens_size: Maximum number of tokens to keep in the last_n_tokens deque.
             lora_base: Optional path to base model, useful if using a quantized base model and you want to apply LoRA to an f16 model.
             lora_path: Path to a LoRA file to apply to the model.
-            tensor_split: List of floats to split the model across multiple GPUs. If None, the model is not split.
-            rope_freq_base: Base frequency for rope sampling.
-            rope_freq_scale: Scale factor for rope sampling.
             verbose: Print verbose output to stderr.
+            kwargs: Unused keyword arguments (for additional backwards compatibility).

         Raises:
             ValueError: If the model path does not exist.
@@ -274,16 +275,20 @@ def __init__(
         self.model_path = model_path

         self.params = llama_cpp.llama_context_default_params()
+        self.params.seed = seed
         self.params.n_ctx = n_ctx
         self.params.n_gpu_layers = 0x7FFFFFFF if n_gpu_layers == -1 else n_gpu_layers  # 0x7FFFFFFF is INT32 max, will be auto set to all layers
-        self.params.seed = seed
+        self.params.main_gpu = main_gpu
+        self.params.rope_freq_base = rope_freq_base
+        self.params.rope_freq_scale = rope_freq_scale
+        self.params.low_vram = low_vram
+        self.params.mul_mat_q = mul_mat_q
         self.params.f16_kv = f16_kv
         self.params.logits_all = logits_all
         self.params.vocab_only = vocab_only
         self.params.use_mmap = use_mmap if lora_path is None else False
         self.params.use_mlock = use_mlock
         self.params.embedding = embedding
-        self.params.low_vram = low_vram

         self.tensor_split = tensor_split
         self._p_tensor_split = None
@@ -296,12 +301,6 @@ def __init__(
             ) # keep a reference to the array so it is not gc'd
             self.params.tensor_split = self._c_tensor_split

-        self.params.rope_freq_base = rope_freq_base
-        self.params.rope_freq_scale = rope_freq_scale
-
-
-        if mul_mat_q is not None:
-            self.params.mul_mat_q = mul_mat_q

         self.last_n_tokens_size = last_n_tokens_size
         self.n_batch = min(n_ctx, n_batch)
@@ -313,10 +312,6 @@ def __init__(
         self.lora_base = lora_base
         self.lora_path = lora_path

-        ### DEPRECATED ###
-        self.n_parts = n_parts
-        ### DEPRECATED ###
-
         if not os.path.exists(model_path):
             raise ValueError(f"Model path does not exist: {model_path}")

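Since all of these parameters sit after the bare `*`, they are keyword-only, so the reordering itself does not break call sites; the dropped `n_parts` and the temporary `n_gqa` / `rms_norm_eps` are now absorbed by `**kwargs` for backwards compatibility. A minimal usage sketch of the new signature, with placeholder values and a hypothetical model path:

```python
from llama_cpp import Llama

# Placeholder model path and example values -- not taken from the commit.
llm = Llama(
    model_path="./models/7B/ggml-model.bin",  # hypothetical path
    seed=1337,            # default is now llama_cpp.LLAMA_DEFAULT_SEED, not 1337
    n_ctx=512,
    n_batch=512,          # moved up next to n_ctx from its old slot after n_threads
    n_gpu_layers=-1,      # -1 becomes 0x7FFFFFFF internally, i.e. offload all layers
    main_gpu=0,
    tensor_split=None,    # e.g. [0.5, 0.5] to split evenly across two GPUs
    rope_freq_base=10000.0,
    rope_freq_scale=1.0,
    low_vram=False,
    mul_mat_q=True,       # now a plain bool defaulting to True, not Optional[bool]
)
```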