@@ -34,12 +34,21 @@ class Settings(BaseSettings):
         default=None,
         description="The alias of the model to use for generating completions.",
     )
+    seed: int = Field(default=llama_cpp.LLAMA_DEFAULT_SEED, description="Random seed. -1 for random.")
     n_ctx: int = Field(default=2048, ge=1, description="The context size.")
+    n_batch: int = Field(
+        default=512, ge=1, description="The batch size to use per eval."
+    )
     n_gpu_layers: int = Field(
         default=0,
         ge=0,
         description="The number of layers to put on the GPU. The rest will be on the CPU.",
     )
+    main_gpu: int = Field(
+        default=0,
+        ge=0,
+        description="Main GPU to use.",
+    )
     tensor_split: Optional[List[float]] = Field(
         default=None,
         description="Split layers across multiple GPUs in proportion.",
@@ -50,35 +59,45 @@ class Settings(BaseSettings):
     rope_freq_scale: float = Field(
         default=1.0, description="RoPE frequency scaling factor"
     )
-    seed: int = Field(default=1337, description="Random seed. -1 for random.")
-    n_batch: int = Field(
-        default=512, ge=1, description="The batch size to use per eval."
+    low_vram: bool = Field(
+        default=False,
+        description="Whether to use less VRAM. This will reduce performance.",
     )
-    n_threads: int = Field(
-        default=max(multiprocessing.cpu_count() // 2, 1),
-        ge=1,
-        description="The number of threads to use.",
+    mul_mat_q: bool = Field(
+        default=True, description="if true, use experimental mul_mat_q kernels"
     )
     f16_kv: bool = Field(default=True, description="Whether to use f16 key/value.")
-    use_mlock: bool = Field(
-        default=llama_cpp.llama_mlock_supported(),
-        description="Use mlock.",
+    logits_all: bool = Field(default=True, description="Whether to return logits.")
+    vocab_only: bool = Field(
+        default=False, description="Whether to only return the vocabulary."
     )
     use_mmap: bool = Field(
         default=llama_cpp.llama_mmap_supported(),
         description="Use mmap.",
     )
+    use_mlock: bool = Field(
+        default=llama_cpp.llama_mlock_supported(),
+        description="Use mlock.",
+    )
     embedding: bool = Field(default=True, description="Whether to use embeddings.")
-    low_vram: bool = Field(
-        default=False,
-        description="Whether to use less VRAM. This will reduce performance.",
+    n_threads: int = Field(
+        default=max(multiprocessing.cpu_count() // 2, 1),
+        ge=1,
+        description="The number of threads to use.",
     )
     last_n_tokens_size: int = Field(
         default=64,
         ge=0,
         description="Last n tokens to keep for repeat penalty calculation.",
     )
-    logits_all: bool = Field(default=True, description="Whether to return logits.")
+    lora_base: Optional[str] = Field(
+        default=None,
+        description="Optional path to base model, useful if using a quantized base model and you want to apply LoRA to an f16 model."
+    )
+    lora_path: Optional[str] = Field(
+        default=None,
+        description="Path to a LoRA file to apply to the model.",
+    )
     cache: bool = Field(
         default=False,
         description="Use a cache to reduce processing times for evaluated prompts.",
@@ -91,9 +110,6 @@ class Settings(BaseSettings):
         default=2 << 30,
         description="The size of the cache in bytes. Only used if cache is True.",
     )
-    vocab_only: bool = Field(
-        default=False, description="Whether to only return the vocabulary."
-    )
     verbose: bool = Field(
         default=True, description="Whether to print debug information."
     )
@@ -103,18 +119,6 @@ class Settings(BaseSettings):
         default=True,
         description="Whether to interrupt requests when a new request is received.",
     )
-    n_gqa: Optional[int] = Field(
-        default=None,
-        description="TEMPORARY: Set to 8 for Llama2 70B",
-    )
-    rms_norm_eps: Optional[float] = Field(
-        default=None,
-        description="TEMPORARY",
-    )
-    mul_mat_q: Optional[bool] = Field(
-        default=None,
-        description="TEMPORARY",
-    )
 
 
 class ErrorResponse(TypedDict):
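
Taken together, the Settings hunks above regroup the load-time options: `seed` now defaults to `llama_cpp.LLAMA_DEFAULT_SEED` instead of 1337, `n_batch` and `n_threads` move next to their related fields, `main_gpu`, `mul_mat_q`, `lora_base`, and `lora_path` are new, and the temporary `n_gqa`/`rms_norm_eps` fields are removed. A minimal sketch of exercising the new fields directly, assuming standard pydantic `BaseSettings` semantics (so the same names can also be supplied via environment variables) and an illustrative model path:

# Sketch, not part of this diff: constructing Settings with the new fields.
from llama_cpp.server.app import Settings

settings = Settings(
    model="./models/7B/ggml-model.bin",  # illustrative; `model` is assumed required
    seed=-1,         # "-1 for random"; the new default is llama_cpp.LLAMA_DEFAULT_SEED
    n_batch=512,     # batch size per eval, validated with ge=1
    main_gpu=0,      # new field: "Main GPU to use."
    lora_path=None,  # new field: optional LoRA file to apply to the model
)
print(settings.seed, settings.n_batch, settings.main_gpu)
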
@@ -334,24 +338,27 @@ def create_app(settings: Optional[Settings] = None):
     global llama
     llama = llama_cpp.Llama(
         model_path=settings.model,
+        seed=settings.seed,
+        n_ctx=settings.n_ctx,
+        n_batch=settings.n_batch,
         n_gpu_layers=settings.n_gpu_layers,
+        main_gpu=settings.main_gpu,
         tensor_split=settings.tensor_split,
         rope_freq_base=settings.rope_freq_base,
         rope_freq_scale=settings.rope_freq_scale,
-        seed=settings.seed,
+        low_vram=settings.low_vram,
+        mul_mat_q=settings.mul_mat_q,
         f16_kv=settings.f16_kv,
-        use_mlock=settings.use_mlock,
+        logits_all=settings.logits_all,
+        vocab_only=settings.vocab_only,
         use_mmap=settings.use_mmap,
+        use_mlock=settings.use_mlock,
         embedding=settings.embedding,
-        logits_all=settings.logits_all,
         n_threads=settings.n_threads,
-        n_batch=settings.n_batch,
-        n_ctx=settings.n_ctx,
         last_n_tokens_size=settings.last_n_tokens_size,
-        vocab_only=settings.vocab_only,
+        lora_base=settings.lora_base,
+        lora_path=settings.lora_path,
         verbose=settings.verbose,
-        n_gqa=settings.n_gqa,
-        rms_norm_eps=settings.rms_norm_eps,
     )
     if settings.cache:
         if settings.cache_type == "disk":
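
For reference, a standalone equivalent of the reordered constructor call in `create_app`, as a sketch: the Settings defaults declared above are substituted for the `settings.*` lookups, `rope_freq_base` (whose default is not shown in this diff) is omitted, and the model path is illustrative.

# Sketch: the Llama constructor as create_app now wires it,
# with the Settings defaults from this diff filled in.
import multiprocessing

import llama_cpp

llama = llama_cpp.Llama(
    model_path="./models/7B/ggml-model.bin",  # illustrative path
    seed=llama_cpp.LLAMA_DEFAULT_SEED,
    n_ctx=2048,
    n_batch=512,
    n_gpu_layers=0,
    main_gpu=0,
    tensor_split=None,
    rope_freq_scale=1.0,
    low_vram=False,
    mul_mat_q=True,
    f16_kv=True,
    logits_all=True,
    vocab_only=False,
    use_mmap=llama_cpp.llama_mmap_supported(),
    use_mlock=llama_cpp.llama_mlock_supported(),
    embedding=True,
    n_threads=max(multiprocessing.cpu_count() // 2, 1),
    last_n_tokens_size=64,
    lora_base=None,
    lora_path=None,
    verbose=True,
)
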