Commit 28d9d38

Merge pull request #5 from l3utterfly/master
merged upstream
2 parents 2c1c46a + 19885d2 · commit 28d9d38

44 files changed: +3900 −2801 lines changed

.github/workflows/tidy-post.yml

Lines changed: 0 additions & 20 deletions
This file was deleted.

.github/workflows/tidy-review.yml

Lines changed: 0 additions & 23 deletions
This file was deleted.

CMakeLists.txt

Lines changed: 5 additions & 0 deletions
@@ -118,6 +118,7 @@ option(LLAMA_SYCL "llama: use SYCL"
 option(LLAMA_SYCL_F16 "llama: use 16 bit floats for sycl calculations" OFF)
 set(LLAMA_SYCL_TARGET "INTEL" CACHE STRING "llama: sycl target device")
 option(LLAMA_CPU_HBM "llama: use memkind for CPU HBM" OFF)
+set(LLAMA_SCHED_MAX_COPIES "4" CACHE STRING "llama: max input copies for pipeline parallelism")

 option(LLAMA_BUILD_TESTS "llama: build tests" ${LLAMA_STANDALONE})
 option(LLAMA_BUILD_EXAMPLES "llama: build examples" ${LLAMA_STANDALONE})
@@ -147,6 +148,8 @@ set(THREADS_PREFER_PTHREAD_FLAG ON)
 find_package(Threads REQUIRED)
 include(CheckCXXCompilerFlag)

+add_compile_definitions(GGML_SCHED_MAX_COPIES=${LLAMA_SCHED_MAX_COPIES})
+
 # enable libstdc++ assertions for debug builds
 if (CMAKE_SYSTEM_NAME MATCHES "Linux")
     add_compile_definitions($<$<CONFIG:Debug>:_GLIBCXX_ASSERTIONS>)
@@ -1085,6 +1088,8 @@ endif()
 add_library(llama
             llama.cpp
             llama.h
+            unicode.h
+            unicode.cpp
             )

 target_include_directories(llama PUBLIC .)

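The new `LLAMA_SCHED_MAX_COPIES` cache variable is forwarded to the compiler as the `GGML_SCHED_MAX_COPIES` preprocessor definition. As a quick sanity check, here is a minimal C++ sketch (not part of the commit; the fallback value of 4 is an assumption mirroring the CMake default above) that prints the value a build was configured with:

```cpp
// sanity-check-sched-copies.cpp -- hypothetical helper, not part of this commit
#include <cstdio>

#ifndef GGML_SCHED_MAX_COPIES
#define GGML_SCHED_MAX_COPIES 4 // assumed fallback, mirroring the CMake default above
#endif

int main() {
    // Prints the pipeline-parallelism copy limit baked in at build time.
    std::printf("GGML_SCHED_MAX_COPIES = %d\n", GGML_SCHED_MAX_COPIES);
    return 0;
}
```
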
Makefile

Lines changed: 9 additions & 2 deletions
@@ -167,6 +167,10 @@ ifeq ($(UNAME_S),OpenBSD)
     MK_CPPFLAGS += -D_BSD_SOURCE
 endif

+ifdef LLAMA_SCHED_MAX_COPIES
+    MK_CPPFLAGS += -DGGML_SCHED_MAX_COPIES=$(LLAMA_SCHED_MAX_COPIES)
+endif
+
 ifdef LLAMA_DEBUG
     MK_CFLAGS += -O0 -g
     MK_CXXFLAGS += -O0 -g
@@ -633,9 +637,12 @@ ggml-backend.o: ggml-backend.c ggml.h ggml-backend.h
 ggml-quants.o: ggml-quants.c ggml.h ggml-quants.h ggml-common.h
     $(CC) $(CFLAGS) -c $< -o $@

-OBJS += ggml-alloc.o ggml-backend.o ggml-quants.o
+unicode.o: unicode.cpp unicode.h
+    $(CXX) $(CXXFLAGS) -c $< -o $@
+
+OBJS += ggml-alloc.o ggml-backend.o ggml-quants.o unicode.o

-llama.o: llama.cpp ggml.h ggml-alloc.h ggml-backend.h ggml-cuda.h ggml-metal.h llama.h
+llama.o: llama.cpp unicode.h ggml.h ggml-alloc.h ggml-backend.h ggml-cuda.h ggml-metal.h llama.h
     $(CXX) $(CXXFLAGS) -c $< -o $@

 COMMON_H_DEPS = common/common.h common/sampling.h common/log.h

Package.swift

Lines changed: 1 addition & 0 deletions
@@ -31,6 +31,7 @@ let package = Package(
         sources: [
             "ggml.c",
             "llama.cpp",
+            "unicode.cpp",
             "ggml-alloc.c",
             "ggml-backend.c",
             "ggml-quants.c",

README.md

Lines changed: 28 additions & 2 deletions
@@ -10,12 +10,14 @@ Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others)

 ### Recent API changes

-- [2024 Mar 8] `llama_kv_cache_seq_rm()` returns a `bool` instead of `void`, and new `llama_n_max_seq()` returns the upper limit of acceptable `seq_id` in batches (relevant when dealing with multiple sequences) https://github.com/ggerganov/llama.cpp/pull/5328
+- [2024 Mar 13] Add `llama_synchronize()` + `llama_context_params.n_ubatch` https://github.com/ggerganov/llama.cpp/pull/6017
+- [2024 Mar 8] `llama_kv_cache_seq_rm()` returns a `bool` instead of `void`, and new `llama_n_seq_max()` returns the upper limit of acceptable `seq_id` in batches (relevant when dealing with multiple sequences) https://github.com/ggerganov/llama.cpp/pull/5328
 - [2024 Mar 4] Embeddings API updated https://github.com/ggerganov/llama.cpp/pull/5796
 - [2024 Mar 3] `struct llama_context_params` https://github.com/ggerganov/llama.cpp/pull/5849

 ### Hot topics

+- Multi-GPU pipeline parallelizm support https://github.com/ggerganov/llama.cpp/pull/6017
 - Looking for contributions to add Deepseek support: https://github.com/ggerganov/llama.cpp/issues/5981
 - Quantization blind testing: https://github.com/ggerganov/llama.cpp/discussions/5962
 - Initial Mamba support has been added: https://github.com/ggerganov/llama.cpp/pull/5328
@@ -902,6 +904,9 @@ First, install the essential packages for termux:
 pkg install clang wget git cmake
 ```
 Second, obtain the [Android NDK](https://developer.android.com/ndk) and then build with CMake:
+
+You can execute the following commands on your computer to avoid downloading the NDK to your mobile. Of course, you can also do this in Termux.
+
 ```
 $ mkdir build-android
 $ cd build-android
@@ -910,7 +915,28 @@ $ cmake -DCMAKE_TOOLCHAIN_FILE=$NDK/build/cmake/android.toolchain.cmake -DANDROI
 $ make
 ```
 Install [termux](https://termux.dev/) on your device and run `termux-setup-storage` to get access to your SD card.
-Finally, copy the `llama` binary and the model files to your device storage. Here is a demo of an interactive session running on Pixel 5 phone:
+Finally, copy these built `llama` binaries and the model file to your device storage. Because the file permissions in the Android sdcard cannot be changed, you can copy the executable files to the `/data/data/com.termux/files/home/bin` path, and then execute the following commands in Termux to add executable permission:
+
+(Assumed that you have pushed the built executable files to the /sdcard/llama.cpp/bin path using `adb push`)
+```
+$cp -r /sdcard/llama.cpp/bin /data/data/com.termux/files/home/
+$cd /data/data/com.termux/files/home/bin
+$chmod +x ./*
+```
+
+Download model [llama-2-7b-chat.Q4_K_M.gguf](https://huggingface.co/TheBloke/Llama-2-7B-Chat-GGUF/blob/main/llama-2-7b-chat.Q4_K_M.gguf), and push it to `/sdcard/llama.cpp/`, then move it to `/data/data/com.termux/files/home/model/`
+
+```
+$mv /sdcard/llama.cpp/llama-2-7b-chat.Q4_K_M.gguf /data/data/com.termux/files/home/model/
+```
+
+Now, you can start chatting:
+```
+$cd /data/data/com.termux/files/home/bin
+$./main -m ../model/llama-2-7b-chat.Q4_K_M.gguf -n 128 -cml
+```
+
+Here is a demo of an interactive session running on Pixel 5 phone:

 https://user-images.githubusercontent.com/271616/225014776-1d567049-ad71-4ef2-b050-55b0b3b9274c.mp4

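For context on the API changes the README now lists, here is a minimal C++ sketch (an illustration, not code from this commit) of the context fields touched by this merge: the new `llama_context_params.n_ubatch` and the `n_seq_max` name. It assumes an already loaded `llama_model *`, and the numeric values are arbitrary examples:

```cpp
#include "llama.h"

// Illustrative only: shows the context parameters referenced in the README notes above.
static llama_context * make_context(llama_model * model) {
    llama_context_params cparams = llama_context_default_params();

    cparams.n_ctx     = 4096; // example context size
    cparams.n_batch   = 2048; // logical maximum batch size per llama_decode() call
    cparams.n_ubatch  = 512;  // physical maximum batch size per compute pass (new in this merge)
    cparams.n_seq_max = 4;    // upper limit on seq_id values used in a batch

    return llama_new_context_with_model(model, cparams);
}
```
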
build.zig

Lines changed: 8 additions & 7 deletions
@@ -115,6 +115,7 @@ pub fn build(b: *std.build.Builder) !void {
     const ggml_alloc = make.obj("ggml-alloc", "ggml-alloc.c");
     const ggml_backend = make.obj("ggml-backend", "ggml-backend.c");
     const ggml_quants = make.obj("ggml-quants", "ggml-quants.c");
+    const unicode = make.obj("unicode", "unicode.cpp");
     const llama = make.obj("llama", "llama.cpp");
     const buildinfo = make.obj("common", "common/build-info.cpp");
     const common = make.obj("common", "common/common.cpp");
@@ -125,14 +126,14 @@ pub fn build(b: *std.build.Builder) !void {
     const clip = make.obj("clip", "examples/llava/clip.cpp");
     const llava = make.obj("llava", "examples/llava/llava.cpp");

-    _ = make.exe("main", "examples/main/main.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, buildinfo, sampling, console, grammar_parser });
-    _ = make.exe("quantize", "examples/quantize/quantize.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, buildinfo });
-    _ = make.exe("perplexity", "examples/perplexity/perplexity.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, buildinfo });
-    _ = make.exe("embedding", "examples/embedding/embedding.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, buildinfo });
-    _ = make.exe("finetune", "examples/finetune/finetune.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, buildinfo, train });
-    _ = make.exe("train-text-from-scratch", "examples/train-text-from-scratch/train-text-from-scratch.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, buildinfo, train });
+    _ = make.exe("main", "examples/main/main.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, common, buildinfo, sampling, console, grammar_parser });
+    _ = make.exe("quantize", "examples/quantize/quantize.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, common, buildinfo });
+    _ = make.exe("perplexity", "examples/perplexity/perplexity.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, common, buildinfo });
+    _ = make.exe("embedding", "examples/embedding/embedding.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, common, buildinfo });
+    _ = make.exe("finetune", "examples/finetune/finetune.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, common, buildinfo, train });
+    _ = make.exe("train-text-from-scratch", "examples/train-text-from-scratch/train-text-from-scratch.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, common, buildinfo, train });

-    const server = make.exe("server", "examples/server/server.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, common, buildinfo, sampling, grammar_parser, clip, llava });
+    const server = make.exe("server", "examples/server/server.cpp", &.{ ggml, ggml_alloc, ggml_backend, ggml_quants, llama, unicode, common, buildinfo, sampling, grammar_parser, clip, llava });
     if (server.target.isWindows()) {
         server.linkSystemLibrary("ws2_32");
     }

common/common.cpp

Lines changed: 20 additions & 10 deletions
@@ -483,6 +483,12 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
                 break;
             }
             params.n_batch = std::stoi(argv[i]);
+        } else if (arg == "-ub" || arg == "--ubatch-size") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.n_ubatch = std::stoi(argv[i]);
         } else if (arg == "--keep") {
             if (++i >= argc) {
                 invalid_param = true;
@@ -977,7 +983,9 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     printf(" binary file containing multiple choice tasks.\n");
     printf(" -n N, --n-predict N number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)\n", params.n_predict);
     printf(" -c N, --ctx-size N size of the prompt context (default: %d, 0 = loaded from model)\n", params.n_ctx);
-    printf(" -b N, --batch-size N batch size for prompt processing (default: %d)\n", params.n_batch);
+    printf(" -b N, --batch-size N logical maximum batch size (default: %d)\n", params.n_batch);
+    printf(" -ub N, --ubatch-size N\n");
+    printf(" physical maximum batch size (default: %d)\n", params.n_ubatch);
     printf(" --samplers samplers that will be used for generation in the order, separated by \';\'\n");
     printf(" (default: %s)\n", sampler_type_names.c_str());
     printf(" --sampling-seq simplified sequence for samplers that will be used (default: %s)\n", sampler_type_chars.c_str());
@@ -1287,8 +1295,9 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param
     auto cparams = llama_context_default_params();

     cparams.n_ctx = params.n_ctx;
+    cparams.n_seq_max = params.n_parallel;
     cparams.n_batch = params.n_batch;
-    cparams.n_parallel = params.n_parallel;
+    cparams.n_ubatch = params.n_ubatch;
     cparams.n_threads = params.n_threads;
     cparams.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch;
     cparams.seed = params.seed;
@@ -1379,6 +1388,7 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
         std::vector<llama_token> tmp = { llama_token_bos(model), llama_token_eos(model), };
         llama_decode(lctx, llama_batch_get_one(tmp.data(), std::min(tmp.size(), (size_t) params.n_batch), 0, 0));
         llama_kv_cache_clear(lctx);
+        llama_synchronize(lctx);
         llama_reset_timings(lctx);
     }

@@ -1786,17 +1796,17 @@ void dump_kv_cache_view(const llama_kv_cache_view & view, int row_size) {
     static const char slot_chars[] = ".123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz+";

     printf("=== Dumping KV cache. total cells %d, max sequences per cell %d, populated cells %d, total tokens in cache %d, largest empty slot=%d @ %d",
-        view.n_cells, view.n_max_seq, view.used_cells, view.token_count, view.max_contiguous, view.max_contiguous_idx);
+        view.n_cells, view.n_seq_max, view.used_cells, view.token_count, view.max_contiguous, view.max_contiguous_idx);

     llama_kv_cache_view_cell * c_curr = view.cells;
     llama_seq_id * cs_curr = view.cells_sequences;

-    for (int i = 0; i < view.n_cells; i++, c_curr++, cs_curr += view.n_max_seq) {
+    for (int i = 0; i < view.n_cells; i++, c_curr++, cs_curr += view.n_seq_max) {
         if (i % row_size == 0) {
             printf("\n%5d: ", i);
         }
         int seq_count = 0;
-        for (int j = 0; j < view.n_max_seq; j++) {
+        for (int j = 0; j < view.n_seq_max; j++) {
             if (cs_curr[j] >= 0) { seq_count++; }
         }
         putchar(slot_chars[std::min(sizeof(slot_chars) - 2, size_t(seq_count))]);
@@ -1809,14 +1819,14 @@ void dump_kv_cache_view_seqs(const llama_kv_cache_view & view, int row_size) {
     static const char slot_chars[] = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz";

     printf("=== Dumping KV cache. total cells %d, max sequences per cell %d, populated cells %d, total tokens in cache %d, largest empty slot=%d @ %d\n",
-        view.n_cells, view.n_max_seq, view.used_cells, view.token_count, view.max_contiguous, view.max_contiguous_idx);
+        view.n_cells, view.n_seq_max, view.used_cells, view.token_count, view.max_contiguous, view.max_contiguous_idx);

     std::unordered_map<llama_seq_id, size_t> seqs;
     llama_kv_cache_view_cell * c_curr = view.cells;
     llama_seq_id * cs_curr = view.cells_sequences;

-    for (int i = 0; i < view.n_cells; i++, c_curr++, cs_curr += view.n_max_seq) {
-        for (int j = 0; j < view.n_max_seq; j++) {
+    for (int i = 0; i < view.n_cells; i++, c_curr++, cs_curr += view.n_seq_max) {
+        for (int j = 0; j < view.n_seq_max; j++) {
             if (cs_curr[j] < 0) { continue; }
             if (seqs.find(cs_curr[j]) == seqs.end()) {
                 if (seqs.size() + 1 >= sizeof(slot_chars)) { break; }
@@ -1835,11 +1845,11 @@ void dump_kv_cache_view_seqs(const llama_kv_cache_view & view, int row_size) {

     c_curr = view.cells;
     cs_curr = view.cells_sequences;
-    for (int i = 0; i < view.n_cells; i++, c_curr++, cs_curr += view.n_max_seq) {
+    for (int i = 0; i < view.n_cells; i++, c_curr++, cs_curr += view.n_seq_max) {
         if (i % row_size == 0) {
             printf("\n%5d: ", i);
         }
-        for (int j = 0; j < view.n_max_seq; j++) {
+        for (int j = 0; j < view.n_seq_max; j++) {
             if (cs_curr[j] >= 0) {
                 const auto & it = seqs.find(cs_curr[j]);
                 putchar(it != seqs.end() ? int(slot_chars[it->second]) : '+');

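The warmup change above pairs `llama_kv_cache_clear()` with the new `llama_synchronize()` so that any asynchronously scheduled backend work finishes before timings are reset. A standalone sketch of that pattern (the `warmup` and `n_batch` names are illustrative, not from the diff):

```cpp
#include "llama.h"

#include <algorithm>
#include <vector>

// Mirrors the warmup block added to llama_init_from_gpt_params() above.
static void warmup(llama_model * model, llama_context * ctx, int n_batch) {
    std::vector<llama_token> tmp = { llama_token_bos(model), llama_token_eos(model) };

    llama_decode(ctx, llama_batch_get_one(tmp.data(), (int32_t) std::min(tmp.size(), (size_t) n_batch), 0, 0));
    llama_kv_cache_clear(ctx);
    llama_synchronize(ctx);   // wait for in-flight backend work before resetting timings
    llama_reset_timings(ctx);
}
```
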
common/common.h

Lines changed: 2 additions & 1 deletion
@@ -51,7 +51,8 @@ struct gpt_params {
     int32_t n_threads_batch_draft = -1;
     int32_t n_predict = -1; // new tokens to predict
     int32_t n_ctx = 512; // context size
-    int32_t n_batch = 512; // batch size for prompt processing (must be >=32 to use BLAS)
+    int32_t n_batch = 2048; // logical batch size for prompt processing (must be >=32 to use BLAS)
+    int32_t n_ubatch = 512; // physical batch size for prompt processing (must be >=32 to use BLAS)
     int32_t n_keep = 0; // number of tokens to keep from initial prompt
     int32_t n_draft = 5; // number of tokens to draft during speculative decoding
     int32_t n_chunks = -1; // max number of chunks to process (-1 = unlimited)

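The comments on the new defaults distinguish a logical batch (`n_batch`, how many tokens one `llama_decode()` call may carry) from a physical batch (`n_ubatch`, how many tokens run per compute pass). A small, self-contained sketch of that split; the numbers and the splitting arithmetic are illustrative assumptions, not code from the commit:

```cpp
#include <algorithm>
#include <cstdio>

int main() {
    const int n_prompt = 5000; // made-up prompt length
    const int n_batch  = 2048; // logical maximum per llama_decode() call (new default above)
    const int n_ubatch = 512;  // physical maximum per compute pass

    for (int i = 0; i < n_prompt; i += n_batch) {
        const int n_eval   = std::min(n_batch, n_prompt - i);
        const int n_passes = (n_eval + n_ubatch - 1) / n_ubatch; // ceil division
        std::printf("decode call: %4d tokens -> %d micro-batches of <= %d tokens\n",
                    n_eval, n_passes, n_ubatch);
    }
    return 0;
}
```
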
common/sampling.cpp

Lines changed: 7 additions & 0 deletions
@@ -17,6 +17,13 @@ struct llama_sampling_context * llama_sampling_init(const struct llama_sampling_
             return nullptr;
         }

+        // Ensure that there is a "root" node.
+        if (result->parsed_grammar.symbol_ids.find("root") == result->parsed_grammar.symbol_ids.end()) {
+            fprintf(stderr, "%s: grammar does not contain a 'root' symbol\n", __func__);
+            delete result;
+            return nullptr;
+        }
+
         std::vector<const llama_grammar_element *> grammar_rules(result->parsed_grammar.c_rules());

         result->grammar = llama_grammar_init(

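The new guard rejects grammars that never define a `root` rule instead of failing later inside `llama_grammar_init()`. Two hypothetical GBNF snippets (not from the commit) show what the check accepts and rejects:

```cpp
// Illustrative GBNF strings as they might be passed to llama_sampling_init()
// via the sampling params; only the first defines the required "root" symbol.
const char * grammar_ok  = "root   ::= answer\n"
                           "answer ::= \"yes\" | \"no\"\n";   // accepted
const char * grammar_bad = "answer ::= \"yes\" | \"no\"\n";   // rejected: no 'root' rule
```
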
examples/batched-bench/batched-bench.cpp

Lines changed: 3 additions & 1 deletion
@@ -106,7 +106,7 @@ int main(int argc, char ** argv) {
     ctx_params.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch;

     // ensure enough sequences are available
-    ctx_params.n_parallel = *std::max_element(n_pl.begin(), n_pl.end());
+    ctx_params.n_seq_max = *std::max_element(n_pl.begin(), n_pl.end());

     llama_context * ctx = llama_new_context_with_model(model, ctx_params);

@@ -138,6 +138,8 @@
             LOG_TEE("failed to decode the batch, n_batch = %d, ret = %d\n", n_batch, ret);
             return false;
         }
+
+        llama_synchronize(ctx);
     }

     return true;

examples/batched/batched.cpp

Lines changed: 1 addition & 1 deletion
@@ -80,7 +80,7 @@ int main(int argc, char ** argv) {
     ctx_params.seed = 1234;
     ctx_params.n_ctx = n_kv_req;
     ctx_params.n_batch = std::max(n_len, n_parallel);
-    ctx_params.n_parallel = n_parallel;
+    ctx_params.n_seq_max = n_parallel;
     ctx_params.n_threads = params.n_threads;
     ctx_params.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch;

examples/embedding/embedding.cpp

Lines changed: 1 addition & 1 deletion
@@ -107,7 +107,7 @@ int main(int argc, char ** argv) {

     // max batch size
     const uint64_t n_batch = params.n_batch;
-    GGML_ASSERT(params.n_batch == params.n_ctx);
+    GGML_ASSERT(params.n_batch >= params.n_ctx);

     // tokenize the prompts and trim
     std::vector<std::vector<int32_t>> inputs;
