
Commit 54ebe70: Merge branch 'master' into gg/ggml-common-decl
2 parents: 7741456 + 44ca159
29 files changed: +2315 -1259 lines

.github/workflows/build.yml

Lines changed: 3 additions & 1 deletion

@@ -425,6 +425,8 @@ jobs:
             defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_KOMPUTE=ON -DKOMPUTE_OPT_DISABLE_VULKAN_VERSION_CHECK=ON -DBUILD_SHARED_LIBS=ON'
           - build: 'vulkan'
             defines: '-DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_VULKAN=ON -DBUILD_SHARED_LIBS=ON'
+          - build: 'arm64'
+            defines: '-A ARM64 -DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DBUILD_SHARED_LIBS=ON'
 
     steps:
     - name: Clone
@@ -520,7 +522,7 @@ jobs:
     - name: Test
       id: cmake_test
       # not all machines have native AVX-512
-      if: ${{ matrix.build != 'clblast' && matrix.build != 'kompute' && matrix.build != 'vulkan' && (matrix.build != 'avx512' || env.HAS_AVX512F == '1') }}
+      if: ${{ matrix.build != 'arm64' && matrix.build != 'clblast' && matrix.build != 'kompute' && matrix.build != 'vulkan' && (matrix.build != 'avx512' || env.HAS_AVX512F == '1') }}
       run: |
         cd build
         ctest -L main -C Release --verbose --timeout 900

CMakeLists.txt

Lines changed: 2 additions & 0 deletions

@@ -1141,6 +1141,8 @@ endif()
 add_library(llama
             llama.cpp
             llama.h
+            unicode.h
+            unicode.cpp
             )
 
 target_include_directories(llama PUBLIC .)

Makefile

Lines changed: 5 additions & 2 deletions

@@ -633,9 +633,12 @@ ggml-backend.o: ggml-backend.c ggml.h ggml-backend.h
 ggml-quants.o: ggml-quants.c ggml.h ggml-quants.h ggml-common.h
 	$(CC) $(CFLAGS) -c $< -o $@
 
-OBJS += ggml-alloc.o ggml-backend.o ggml-quants.o
+unicode.o: unicode.cpp unicode.h
+	$(CXX) $(CXXFLAGS) -c $< -o $@
+
+OBJS += ggml-alloc.o ggml-backend.o ggml-quants.o unicode.o
 
-llama.o: llama.cpp ggml.h ggml-alloc.h ggml-backend.h ggml-cuda.h ggml-metal.h llama.h
+llama.o: llama.cpp unicode.h ggml.h ggml-alloc.h ggml-backend.h ggml-cuda.h ggml-metal.h llama.h
 	$(CXX) $(CXXFLAGS) -c $< -o $@
 
 COMMON_H_DEPS = common/common.h common/sampling.h common/log.h

Package.swift

Lines changed: 1 addition & 0 deletions

@@ -31,6 +31,7 @@ let package = Package(
         sources: [
             "ggml.c",
             "llama.cpp",
+            "unicode.cpp",
             "ggml-alloc.c",
             "ggml-backend.c",
             "ggml-quants.c",

README.md

Lines changed: 1 addition & 1 deletion

@@ -10,7 +10,7 @@ Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others)
 
 ### Recent API changes
 
-- [2024 Mar 8] `llama_kv_cache_seq_rm()` returns a `bool` instead of `void`, and new `llama_n_max_seq()` returns the upper limit of acceptable `seq_id` in batches (relevant when dealing with multiple sequences) https://github.com/ggerganov/llama.cpp/pull/5328
+- [2024 Mar 8] `llama_kv_cache_seq_rm()` returns a `bool` instead of `void`, and new `llama_n_seq_max()` returns the upper limit of acceptable `seq_id` in batches (relevant when dealing with multiple sequences) https://github.com/ggerganov/llama.cpp/pull/5328
 - [2024 Mar 4] Embeddings API updated https://github.com/ggerganov/llama.cpp/pull/5796
 - [2024 Mar 3] `struct llama_context_params` https://github.com/ggerganov/llama.cpp/pull/5849
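The renamed calls read roughly as follows. This is a minimal sketch assuming only the `llama.h` declarations referenced in the entry above (`llama_n_seq_max()` returning the per-context sequence limit, `llama_kv_cache_seq_rm()` returning `bool`); the `clear_sequence()` helper and its range check are illustrative, not part of this commit:

```cpp
#include "llama.h"

#include <cstdio>

// Illustrative helper (not from this commit): clear one sequence from the
// KV cache and report whether the removal succeeded.
static bool clear_sequence(llama_context * ctx, llama_seq_id seq_id) {
    // upper limit of acceptable seq_id values for this context
    const int n_seq_max = (int) llama_n_seq_max(ctx);
    if (seq_id < 0 || seq_id >= n_seq_max) {
        fprintf(stderr, "seq_id %d out of range [0, %d)\n", seq_id, n_seq_max);
        return false;
    }

    // negative p0/p1 selects the whole position range; the bool result
    // replaces the old void return
    return llama_kv_cache_seq_rm(ctx, seq_id, -1, -1);
}
```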

build.zig

Lines changed: 8 additions & 7 deletions

@@ -115,6 +115,7 @@ pub fn build(b: *std.build.Builder) !void {
     const ggml_alloc = make.obj("ggml-alloc", "ggml-alloc.c");
     const ggml_backend = make.obj("ggml-backend", "ggml-backend.c");
     const ggml_quants = make.obj("ggml-quants", "ggml-quants.c");
+    const unicode = make.obj("unicode", "unicode.cpp");
     const llama = make.obj("llama", "llama.cpp");
     const buildinfo = make.obj("common", "common/build-info.cpp");
     const common = make.obj("common", "common/common.cpp");
@@ -125,14 +126,14 @@ pub fn build(b: *std.build.Builder) !void {
     const clip = make.obj("clip", "examples/llava/clip.cpp");
     const llava = make.obj("llava", "examples/llava/llava.cpp");
 
-    _ = make.exe("main", "examples/main/main.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, buildinfo, sampling, console, grammar_parser });
-    _ = make.exe("quantize", "examples/quantize/quantize.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, buildinfo });
-    _ = make.exe("perplexity", "examples/perplexity/perplexity.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, buildinfo });
-    _ = make.exe("embedding", "examples/embedding/embedding.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, buildinfo });
-    _ = make.exe("finetune", "examples/finetune/finetune.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, buildinfo, train });
-    _ = make.exe("train-text-from-scratch", "examples/train-text-from-scratch/train-text-from-scratch.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, buildinfo, train });
+    _ = make.exe("main", "examples/main/main.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, common, buildinfo, sampling, console, grammar_parser });
+    _ = make.exe("quantize", "examples/quantize/quantize.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, common, buildinfo });
+    _ = make.exe("perplexity", "examples/perplexity/perplexity.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, common, buildinfo });
+    _ = make.exe("embedding", "examples/embedding/embedding.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, common, buildinfo });
+    _ = make.exe("finetune", "examples/finetune/finetune.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, common, buildinfo, train });
+    _ = make.exe("train-text-from-scratch", "examples/train-text-from-scratch/train-text-from-scratch.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, common, buildinfo, train });
 
-    const server = make.exe("server", "examples/server/server.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, buildinfo, sampling, grammar_parser, clip, llava });
+    const server = make.exe("server", "examples/server/server.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, common, buildinfo, sampling, grammar_parser, clip, llava });
     if (server.target.isWindows()) {
         server.linkSystemLibrary("ws2_32");
     }

common/common.cpp

Lines changed: 9 additions & 9 deletions

@@ -1288,7 +1288,7 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param
 
     cparams.n_ctx = params.n_ctx;
     cparams.n_batch = params.n_batch;
-    cparams.n_parallel = params.n_parallel;
+    cparams.n_seq_max = params.n_parallel;
     cparams.n_threads = params.n_threads;
     cparams.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch;
     cparams.seed = params.seed;
@@ -1786,17 +1786,17 @@ void dump_kv_cache_view(const llama_kv_cache_view & view, int row_size) {
     static const char slot_chars[] = ".123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz+";
 
     printf("=== Dumping KV cache. total cells %d, max sequences per cell %d, populated cells %d, total tokens in cache %d, largest empty slot=%d @ %d",
-        view.n_cells, view.n_max_seq, view.used_cells, view.token_count, view.max_contiguous, view.max_contiguous_idx);
+        view.n_cells, view.n_seq_max, view.used_cells, view.token_count, view.max_contiguous, view.max_contiguous_idx);
 
     llama_kv_cache_view_cell * c_curr = view.cells;
     llama_seq_id * cs_curr = view.cells_sequences;
 
-    for (int i = 0; i < view.n_cells; i++, c_curr++, cs_curr += view.n_max_seq) {
+    for (int i = 0; i < view.n_cells; i++, c_curr++, cs_curr += view.n_seq_max) {
         if (i % row_size == 0) {
             printf("\n%5d: ", i);
         }
         int seq_count = 0;
-        for (int j = 0; j < view.n_max_seq; j++) {
+        for (int j = 0; j < view.n_seq_max; j++) {
             if (cs_curr[j] >= 0) { seq_count++; }
         }
         putchar(slot_chars[std::min(sizeof(slot_chars) - 2, size_t(seq_count))]);
@@ -1809,14 +1809,14 @@ void dump_kv_cache_view_seqs(const llama_kv_cache_view & view, int row_size) {
     static const char slot_chars[] = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz";
 
     printf("=== Dumping KV cache. total cells %d, max sequences per cell %d, populated cells %d, total tokens in cache %d, largest empty slot=%d @ %d\n",
-        view.n_cells, view.n_max_seq, view.used_cells, view.token_count, view.max_contiguous, view.max_contiguous_idx);
+        view.n_cells, view.n_seq_max, view.used_cells, view.token_count, view.max_contiguous, view.max_contiguous_idx);
 
     std::unordered_map<llama_seq_id, size_t> seqs;
     llama_kv_cache_view_cell * c_curr = view.cells;
     llama_seq_id * cs_curr = view.cells_sequences;
 
-    for (int i = 0; i < view.n_cells; i++, c_curr++, cs_curr += view.n_max_seq) {
-        for (int j = 0; j < view.n_max_seq; j++) {
+    for (int i = 0; i < view.n_cells; i++, c_curr++, cs_curr += view.n_seq_max) {
+        for (int j = 0; j < view.n_seq_max; j++) {
             if (cs_curr[j] < 0) { continue; }
             if (seqs.find(cs_curr[j]) == seqs.end()) {
                 if (seqs.size() + 1 >= sizeof(slot_chars)) { break; }
@@ -1835,11 +1835,11 @@ void dump_kv_cache_view_seqs(const llama_kv_cache_view & view, int row_size) {
 
     c_curr = view.cells;
     cs_curr = view.cells_sequences;
-    for (int i = 0; i < view.n_cells; i++, c_curr++, cs_curr += view.n_max_seq) {
+    for (int i = 0; i < view.n_cells; i++, c_curr++, cs_curr += view.n_seq_max) {
         if (i % row_size == 0) {
             printf("\n%5d: ", i);
         }
-        for (int j = 0; j < view.n_max_seq; j++) {
+        for (int j = 0; j < view.n_seq_max; j++) {
             if (cs_curr[j] >= 0) {
                 const auto & it = seqs.find(cs_curr[j]);
                 putchar(it != seqs.end() ? int(slot_chars[it->second]) : '+');
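For orientation, a usage sketch of how these dump helpers are typically driven, assuming the `llama_kv_cache_view_*` functions declared in `llama.h` and the `dump_kv_cache_view_seqs()` declaration in `common.h`; the `print_kv_cache()` wrapper below is hypothetical and not part of this commit:

```cpp
#include "common.h"
#include "llama.h"

// Hypothetical helper (not from this commit): build a KV cache view whose
// per-cell sequence capacity matches the context's sequence limit, refresh
// it, and print one character per cell.
static void print_kv_cache(llama_context * ctx) {
    llama_kv_cache_view view = llama_kv_cache_view_init(ctx, (int32_t) llama_n_seq_max(ctx));

    llama_kv_cache_view_update(ctx, &view);   // populate cells and sequence ids
    dump_kv_cache_view_seqs(view, 40);        // 40 cells per printed row
    llama_kv_cache_view_free(&view);
}
```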

examples/batched-bench/batched-bench.cpp

Lines changed: 1 addition & 1 deletion

@@ -106,7 +106,7 @@ int main(int argc, char ** argv) {
     ctx_params.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch;
 
     // ensure enough sequences are available
-    ctx_params.n_parallel = *std::max_element(n_pl.begin(), n_pl.end());
+    ctx_params.n_seq_max = *std::max_element(n_pl.begin(), n_pl.end());
 
     llama_context * ctx = llama_new_context_with_model(model, ctx_params);

examples/batched/batched.cpp

Lines changed: 1 addition & 1 deletion

@@ -80,7 +80,7 @@ int main(int argc, char ** argv) {
     ctx_params.seed = 1234;
     ctx_params.n_ctx = n_kv_req;
     ctx_params.n_batch = std::max(n_len, n_parallel);
-    ctx_params.n_parallel = n_parallel;
+    ctx_params.n_seq_max = n_parallel;
     ctx_params.n_threads = params.n_threads;
     ctx_params.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch;

examples/main/main.cpp

Lines changed: 1 addition & 0 deletions

@@ -878,6 +878,7 @@ int main(int argc, char ** argv) {
     const auto line_pfx = ::llama_tokenize(ctx, params.input_prefix, false, true);
     const auto line_inp = ::llama_tokenize(ctx, buffer, false, false);
     const auto line_sfx = ::llama_tokenize(ctx, params.input_suffix, false, true);
+
     LOG("input tokens: %s\n", LOG_TOKENS_TOSTR_PRETTY(ctx, line_inp).c_str());
 
     embd_inp.insert(embd_inp.end(), line_pfx.begin(), line_pfx.end());

examples/perplexity/perplexity.cpp

Lines changed: 3 additions & 3 deletions

@@ -841,7 +841,7 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) {
     const int n_batch = params.n_batch;
 
     const int max_tasks_per_batch = 32;
-    const int max_seq = std::min(4*max_tasks_per_batch, (int) llama_n_max_seq(ctx));
+    const int max_seq = std::min(4*max_tasks_per_batch, (int) llama_n_seq_max(ctx));
 
     llama_batch batch = llama_batch_init(n_ctx, 0, max_seq);
@@ -1118,7 +1118,7 @@ static void winogrande_score(llama_context * ctx, const gpt_params & params) {
     const int n_batch = params.n_batch;
 
     const int max_tasks_per_batch = 128;
-    const int max_seq = std::min(2*max_tasks_per_batch, (int) llama_n_max_seq(ctx));
+    const int max_seq = std::min(2*max_tasks_per_batch, (int) llama_n_seq_max(ctx));
 
     llama_batch batch = llama_batch_init(n_ctx, 0, max_seq);
@@ -1470,7 +1470,7 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params
     const int n_batch = params.n_batch;
 
     const int max_tasks_per_batch = 32;
-    const int max_seq = std::min(4*max_tasks_per_batch, (int) llama_n_max_seq(ctx));
+    const int max_seq = std::min(4*max_tasks_per_batch, (int) llama_n_seq_max(ctx));
 
     llama_batch batch = llama_batch_init(n_ctx, 0, max_seq);

examples/server/README.md

Lines changed: 46 additions & 4 deletions

@@ -123,10 +123,10 @@ You can consume the endpoints with Postman or NodeJS with axios library. You can
 ### Docker
 
 ```bash
-docker run -p 8080:8080 -v /path/to/models:/models ggerganov/llama.cpp:server -m models/7B/ggml-model.gguf -c 512 --host 0.0.0.0 --port 8080
+docker run -p 8080:8080 -v /path/to/models:/models ghcr.io/ggerganov/llama.cpp:server -m models/7B/ggml-model.gguf -c 512 --host 0.0.0.0 --port 8080
 
 # or, with CUDA:
-docker run -p 8080:8080 -v /path/to/models:/models --gpus all ggerganov/llama.cpp:server-cuda -m models/7B/ggml-model.gguf -c 512 --host 0.0.0.0 --port 8080 --n-gpu-layers 99
+docker run -p 8080:8080 -v /path/to/models:/models --gpus all ghcr.io/ggerganov/llama.cpp:server-cuda -m models/7B/ggml-model.gguf -c 512 --host 0.0.0.0 --port 8080 --n-gpu-layers 99
 ```
 
 ## Testing with CURL
@@ -556,9 +556,51 @@ Run with bash:
 bash chat.sh
 ```
 
-### API like OAI
+### OAI-like API
 
-The HTTP server supports OAI-like API
+The HTTP server supports OAI-like API: https://github.com/openai/openai-openapi
+
+### API errors
+
+Server returns error in the same format as OAI: https://github.com/openai/openai-openapi
+
+Example of an error:
+
+```json
+{
+  "error": {
+    "code": 401,
+    "message": "Invalid API Key",
+    "type": "authentication_error"
+  }
+}
+```
+
+Apart from error types supported by OAI, we also have custom types that are specific to functionalities of llama.cpp:
+
+**When /metrics or /slots endpoint is disabled**
+
+```json
+{
+  "error": {
+    "code": 501,
+    "message": "This server does not support metrics endpoint.",
+    "type": "not_supported_error"
+  }
+}
+```
+
+**When the server receives invalid grammar via */completions endpoint**
+
+```json
+{
+  "error": {
+    "code": 400,
+    "message": "Failed to parse grammar",
+    "type": "invalid_request_error"
+  }
+}
+```
 
 ### Extending or building alternative Web Front End
