Commit 2920c4b

Update server params. Added lora_base, lora_path, low_vram, and main_gpu. Removed rms_norm_eps and n_gqa (deprecated in llama.cpp).
1 parent 6a20293 commit 2920c4b
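
The new fields are ordinary pydantic settings, so they can be supplied the same way as the existing ones. Below is a minimal sketch of configuring the server with the added options; it assumes pydantic BaseSettings' default unprefixed, case-insensitive environment binding and uses hypothetical model/LoRA paths that are not part of this commit.

import os

# Hypothetical paths; substitute real model and LoRA files.
os.environ["MODEL"] = "./models/7B/ggml-model-q4_0.bin"
os.environ["LORA_BASE"] = "./models/7B/ggml-model-f16.bin"  # new: f16 base to apply the LoRA against
os.environ["LORA_PATH"] = "./loras/my-adapter.bin"          # new: LoRA file to apply
os.environ["MAIN_GPU"] = "0"                                # new: main GPU to use
os.environ["LOW_VRAM"] = "true"                             # new: use less VRAM at some cost in speed

from llama_cpp.server.app import Settings, create_app

settings = Settings()       # field values are read from the environment
app = create_app(settings)  # FastAPI app; serve it with uvicorn, e.g. uvicorn.run(app)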

File tree

1 file changed: +44 -37

llama_cpp/server/app.py

Lines changed: 44 additions & 37 deletions
@@ -34,12 +34,21 @@ class Settings(BaseSettings):
         default=None,
         description="The alias of the model to use for generating completions.",
     )
+    seed: int = Field(default=llama_cpp.LLAMA_DEFAULT_SEED, description="Random seed. -1 for random.")
     n_ctx: int = Field(default=2048, ge=1, description="The context size.")
+    n_batch: int = Field(
+        default=512, ge=1, description="The batch size to use per eval."
+    )
     n_gpu_layers: int = Field(
         default=0,
         ge=0,
         description="The number of layers to put on the GPU. The rest will be on the CPU.",
     )
+    main_gpu: int = Field(
+        default=0,
+        ge=0,
+        description="Main GPU to use.",
+    )
     tensor_split: Optional[List[float]] = Field(
         default=None,
         description="Split layers across multiple GPUs in proportion.",
@@ -50,35 +59,45 @@ class Settings(BaseSettings):
     rope_freq_scale: float = Field(
         default=1.0, description="RoPE frequency scaling factor"
     )
-    seed: int = Field(default=1337, description="Random seed. -1 for random.")
-    n_batch: int = Field(
-        default=512, ge=1, description="The batch size to use per eval."
+    low_vram: bool = Field(
+        default=False,
+        description="Whether to use less VRAM. This will reduce performance.",
     )
-    n_threads: int = Field(
-        default=max(multiprocessing.cpu_count() // 2, 1),
-        ge=1,
-        description="The number of threads to use.",
+    mul_mat_q: bool = Field(
+        default=True, description="if true, use experimental mul_mat_q kernels"
     )
     f16_kv: bool = Field(default=True, description="Whether to use f16 key/value.")
-    use_mlock: bool = Field(
-        default=llama_cpp.llama_mlock_supported(),
-        description="Use mlock.",
+    logits_all: bool = Field(default=True, description="Whether to return logits.")
+    vocab_only: bool = Field(
+        default=False, description="Whether to only return the vocabulary."
     )
     use_mmap: bool = Field(
         default=llama_cpp.llama_mmap_supported(),
         description="Use mmap.",
     )
+    use_mlock: bool = Field(
+        default=llama_cpp.llama_mlock_supported(),
+        description="Use mlock.",
+    )
     embedding: bool = Field(default=True, description="Whether to use embeddings.")
-    low_vram: bool = Field(
-        default=False,
-        description="Whether to use less VRAM. This will reduce performance.",
+    n_threads: int = Field(
+        default=max(multiprocessing.cpu_count() // 2, 1),
+        ge=1,
+        description="The number of threads to use.",
     )
     last_n_tokens_size: int = Field(
         default=64,
         ge=0,
         description="Last n tokens to keep for repeat penalty calculation.",
     )
-    logits_all: bool = Field(default=True, description="Whether to return logits.")
+    lora_base: Optional[str] = Field(
+        default=None,
+        description="Optional path to base model, useful if using a quantized base model and you want to apply LoRA to an f16 model."
+    )
+    lora_path: Optional[str] = Field(
+        default=None,
+        description="Path to a LoRA file to apply to the model.",
+    )
     cache: bool = Field(
         default=False,
         description="Use a cache to reduce processing times for evaluated prompts.",
@@ -91,9 +110,6 @@ class Settings(BaseSettings):
         default=2 << 30,
         description="The size of the cache in bytes. Only used if cache is True.",
     )
-    vocab_only: bool = Field(
-        default=False, description="Whether to only return the vocabulary."
-    )
     verbose: bool = Field(
         default=True, description="Whether to print debug information."
     )
@@ -103,18 +119,6 @@ class Settings(BaseSettings):
         default=True,
         description="Whether to interrupt requests when a new request is received.",
     )
-    n_gqa: Optional[int] = Field(
-        default=None,
-        description="TEMPORARY: Set to 8 for Llama2 70B",
-    )
-    rms_norm_eps: Optional[float] = Field(
-        default=None,
-        description="TEMPORARY",
-    )
-    mul_mat_q: Optional[bool] = Field(
-        default=None,
-        description="TEMPORARY",
-    )


 class ErrorResponse(TypedDict):
@@ -334,24 +338,27 @@ def create_app(settings: Optional[Settings] = None):
     global llama
     llama = llama_cpp.Llama(
         model_path=settings.model,
+        seed=settings.seed,
+        n_ctx=settings.n_ctx,
+        n_batch=settings.n_batch,
         n_gpu_layers=settings.n_gpu_layers,
+        main_gpu=settings.main_gpu,
         tensor_split=settings.tensor_split,
         rope_freq_base=settings.rope_freq_base,
         rope_freq_scale=settings.rope_freq_scale,
-        seed=settings.seed,
+        low_vram=settings.low_vram,
+        mul_mat_q=settings.mul_mat_q,
         f16_kv=settings.f16_kv,
-        use_mlock=settings.use_mlock,
+        logits_all=settings.logits_all,
+        vocab_only=settings.vocab_only,
         use_mmap=settings.use_mmap,
+        use_mlock=settings.use_mlock,
         embedding=settings.embedding,
-        logits_all=settings.logits_all,
         n_threads=settings.n_threads,
-        n_batch=settings.n_batch,
-        n_ctx=settings.n_ctx,
         last_n_tokens_size=settings.last_n_tokens_size,
-        vocab_only=settings.vocab_only,
+        lora_base=settings.lora_base,
+        lora_path=settings.lora_path,
         verbose=settings.verbose,
-        n_gqa=settings.n_gqa,
-        rms_norm_eps=settings.rms_norm_eps,
     )
     if settings.cache:
         if settings.cache_type == "disk":
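
The keyword arguments forwarded above map directly onto the public llama_cpp.Llama constructor, so the same options can also be exercised outside the server. Below is a minimal sketch with hypothetical paths; only the constructor arguments themselves come from this commit.

import llama_cpp

# Hypothetical paths; the keyword arguments mirror the ones create_app now forwards.
llm = llama_cpp.Llama(
    model_path="./models/7B/ggml-model-q4_0.bin",  # quantized base model
    seed=llama_cpp.LLAMA_DEFAULT_SEED,             # the server's new default seed
    main_gpu=0,                                    # added setting: main GPU to use
    low_vram=False,                                # added setting: use less VRAM when True, at reduced performance
    lora_base="./models/7B/ggml-model-f16.bin",    # added setting: f16 base model for the LoRA
    lora_path="./loras/my-adapter.bin",            # added setting: LoRA file to apply
)

# Standard completion call; llama-cpp-python returns an OpenAI-style dict.
out = llm("Q: Name the planets in the solar system. A:", max_tokens=32)
print(out["choices"][0]["text"])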
