Commit 76515b7

Merge remote-tracking branch 'origin/master' into cuda-graph-allocr

2 parents: 2bfb39a + c8dba40


56 files changed: +10232 / -3177 lines

.gitignore

Lines changed: 3 additions & 1 deletion
@@ -1,6 +1,7 @@
 *.o
 *.a
 *.so
+*.gguf
 *.bin
 .DS_Store
 .build/
@@ -47,6 +48,8 @@ models-mnt
 /server
 /Pipfile
 /embd-input-test
+/gguf
+/gguf-llama-simple
 /libllama.so
 /llama-bench
 build-info.h
@@ -65,7 +68,6 @@ perf-*.txt
 
 examples/jeopardy/results.txt
 
-
 pyproject.toml
 poetry.lock
 poetry.toml
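
The new entries keep generated GGUF model files and the freshly added `gguf` / `gguf-llama-simple` binaries out of version control. A quick way to confirm which rule matches which path (the model path is illustrative):

```bash
# -v prints the .gitignore rule that matches each path
git check-ignore -v models/7B/ggml-model-q4_0.gguf gguf gguf-llama-simple
```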

CMakeLists.txt

Lines changed: 11 additions & 2 deletions
@@ -497,9 +497,11 @@ else()
 endif()
 
 #
-# Build libraries
+# libraries
 #
 
+# ggml
+
 add_library(ggml OBJECT
             ggml.c
             ggml.h
@@ -524,10 +526,11 @@ if (BUILD_SHARED_LIBS)
     install(TARGETS ggml_shared LIBRARY)
 endif()
 
+# llama
+
 add_library(llama
             llama.cpp
             llama.h
-            llama-util.h
             )
 
 target_include_directories(llama PUBLIC .)
@@ -546,6 +549,10 @@ if (BUILD_SHARED_LIBS)
     install(TARGETS llama LIBRARY)
 endif()
 
+#
+# install
+#
+
 include(GNUInstallDirs)
 install(
     FILES convert.py
@@ -584,6 +591,8 @@ endif()
 # programs, examples and tests
 #
 
+add_subdirectory(common)
+
 if (LLAMA_BUILD_TESTS AND NOT CMAKE_JS_VERSION)
     include(CTest)
     add_subdirectory(tests)
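
The build is driven the same way as before: the new `# install` section only groups the existing install rules, and `add_subdirectory(common)` pulls in the new helper library. A minimal sketch of configuring, building, and installing (the build directory and prefix are illustrative):

```bash
# configure and build all libraries and example programs
cmake -B build
cmake --build build --config Release

# optionally install the libraries, headers and convert.py under a prefix
cmake --install build --prefix "$HOME/.local"
```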

Makefile

Lines changed: 13 additions & 10 deletions
@@ -1,5 +1,5 @@
 # Define the default target now so that it is always the first target
-BUILD_TARGETS = main quantize quantize-stats perplexity embedding vdot train-text-from-scratch convert-llama2c-to-ggml simple server embd-input-test llama-bench
+BUILD_TARGETS = main quantize quantize-stats perplexity embedding vdot train-text-from-scratch convert-llama2c-to-ggml simple server embd-input-test gguf llama-bench
 
 # Binaries only useful for tests
 TEST_TARGETS = tests/test-llama-grammar tests/test-grammar-parser tests/test-double-float tests/test-grad0 tests/test-opt tests/test-quantize-fns tests/test-quantize-perf tests/test-sampling tests/test-tokenizer-0
@@ -45,8 +45,8 @@ OPT = -Ofast
 else
 OPT = -O3
 endif
-CFLAGS   = -I.              $(OPT) -std=c11   -fPIC
-CXXFLAGS = -I. -I./examples $(OPT) -std=c++11 -fPIC
+CFLAGS   = -I.            $(OPT) -std=c11   -fPIC
+CXXFLAGS = -I. -I./common $(OPT) -std=c++11 -fPIC
 LDFLAGS  =
 
 ifdef LLAMA_DEBUG
@@ -329,23 +329,23 @@ ggml-alloc.o: ggml-alloc.c ggml.h ggml-alloc.h
 
 OBJS += ggml-alloc.o
 
-llama.o: llama.cpp ggml.h ggml-alloc.h ggml-cuda.h ggml-metal.h llama.h llama-util.h
+llama.o: llama.cpp ggml.h ggml-alloc.h ggml-cuda.h ggml-metal.h llama.h
 	$(CXX) $(CXXFLAGS) -c $< -o $@
 
-common.o: examples/common.cpp examples/common.h
+common.o: common/common.cpp common/common.h
 	$(CXX) $(CXXFLAGS) -c $< -o $@
 
-console.o: examples/console.cpp examples/console.h
+console.o: common/console.cpp common/console.h
 	$(CXX) $(CXXFLAGS) -c $< -o $@
 
-grammar-parser.o: examples/grammar-parser.cpp examples/grammar-parser.h
+grammar-parser.o: common/grammar-parser.cpp common/grammar-parser.h
 	$(CXX) $(CXXFLAGS) -c $< -o $@
 
 libllama.so: llama.o ggml.o $(OBJS)
 	$(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS)
 
 clean:
-	rm -vf *.o *.so *.dll main quantize quantize-stats perplexity embedding benchmark-matmult save-load-state server simple vdot train-text-from-scratch convert-llama2c-to-ggml embd-input-test llama-bench build-info.h $(TEST_TARGETS)
+	rm -vf *.o *.so *.dll main quantize quantize-stats perplexity embedding benchmark-matmult save-load-state server simple vdot train-text-from-scratch convert-llama2c-to-ggml embd-input-test gguf llama-bench build-info.h $(TEST_TARGETS)
 
 #
 # Examples
@@ -385,7 +385,10 @@ $(LIB_PRE)embdinput$(DSO_EXT): examples/embd-input/embd-input.h examples/embd-in
 embd-input-test: $(LIB_PRE)embdinput$(DSO_EXT) examples/embd-input/embd-input-test.cpp build-info.h ggml.o llama.o common.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %$(DSO_EXT),$(filter-out %.h,$(filter-out %.hpp,$^))) -o $@ $(LDFLAGS) -L. -lembdinput
 
-train-text-from-scratch: examples/train-text-from-scratch/train-text-from-scratch.cpp build-info.h ggml.o llama.o $(OBJS)
+gguf: examples/gguf/gguf.cpp build-info.h ggml.o llama.o $(OBJS)
+	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
+
+train-text-from-scratch: examples/train-text-from-scratch/train-text-from-scratch.cpp build-info.h ggml.o llama.o common.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 
 convert-llama2c-to-ggml: examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp build-info.h ggml.o llama.o $(OBJS)
@@ -418,7 +421,7 @@ vdot: pocs/vdot/vdot.cpp ggml.o $(OBJS)
 tests/test-llama-grammar: tests/test-llama-grammar.cpp build-info.h ggml.o llama.o common.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.txt,$^) -o $@ $(LDFLAGS)
 
-tests/test-grammar-parser: tests/test-grammar-parser.cpp examples/grammar-parser.cpp build-info.h ggml.o llama.o common.o $(OBJS)
+tests/test-grammar-parser: tests/test-grammar-parser.cpp build-info.h ggml.o llama.o common.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.txt,$^) -o $@ $(LDFLAGS)
 
 tests/test-double-float: tests/test-double-float.cpp build-info.h ggml.o llama.o common.o $(OBJS)
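
With `gguf` added to `BUILD_TARGETS` and given its own rule, the new example can be built directly from the Makefile; a minimal sketch, assuming GNU make at the repository root:

```bash
# build just the new gguf example binary
make gguf

# or build every default target, gguf included
make -j
```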

README.md

Lines changed: 21 additions & 13 deletions
@@ -9,11 +9,17 @@
 
 Inference of [LLaMA](https://arxiv.org/abs/2302.13971) model in pure C/C++
 
-### 🚧 Incoming breaking change + refactoring:
+### Hot topics
 
-See PR https://github.com/ggerganov/llama.cpp/pull/2398 for more info.
+A new file format has been introduced: [GGUF](https://github.com/ggerganov/llama.cpp/pull/2398)
 
-To devs: avoid making big changes to `llama.h` / `llama.cpp` until merged
+Last revision compatible with the old format: [dadbed9](https://github.com/ggerganov/llama.cpp/commit/dadbed99e65252d79f81101a392d0d6497b86caa)
+
+### Current `master` should be considered in Beta - expect some issues for a few days!
+
+### Be prepared to re-convert and / or re-quantize your GGUF models while this notice is up!
+
+### Issues with non-GGUF models will be considered with low priority!
 
 ----
 
@@ -291,7 +297,7 @@ When built with Metal support, you can enable GPU inference with the `--gpu-laye
 Any value larger than 0 will offload the computation to the GPU. For example:
 
 ```bash
-./main -m ./models/7B/ggml-model-q4_0.bin -n 128 -ngl 1
+./main -m ./models/7B/ggml-model-q4_0.gguf -n 128 -ngl 1
 ```
 
 ### MPI Build
@@ -330,7 +336,7 @@ The above will distribute the computation across 2 processes on the first host a
 Finally, you're ready to run a computation using `mpirun`:
 
 ```bash
-mpirun -hostfile hostfile -n 3 ./main -m ./models/7B/ggml-model-q4_0.bin -n 128
+mpirun -hostfile hostfile -n 3 ./main -m ./models/7B/ggml-model-q4_0.gguf -n 128
 ```
 
 ### BLAS Build
@@ -513,10 +519,10 @@ python3 convert.py models/7B/
 python convert.py models/7B/ --vocabtype bpe
 
 # quantize the model to 4-bits (using q4_0 method)
-./quantize ./models/7B/ggml-model-f16.bin ./models/7B/ggml-model-q4_0.bin q4_0
+./quantize ./models/7B/ggml-model-f16.gguf ./models/7B/ggml-model-q4_0.gguf q4_0
 
 # run the inference
-./main -m ./models/7B/ggml-model-q4_0.bin -n 128
+./main -m ./models/7B/ggml-model-q4_0.gguf -n 128
 ```
 
 When running the larger models, make sure you have enough disk space to store all the intermediate files.
@@ -572,7 +578,7 @@ Here is an example of a few-shot interaction, invoked with the command
 ./examples/chat-13B.sh
 
 # custom arguments using a 13B model
-./main -m ./models/13B/ggml-model-q4_0.bin -n 256 --repeat_penalty 1.0 --color -i -r "User:" -f prompts/chat-with-bob.txt
+./main -m ./models/13B/ggml-model-q4_0.gguf -n 256 --repeat_penalty 1.0 --color -i -r "User:" -f prompts/chat-with-bob.txt
 ```
 
 Note the use of `--color` to distinguish between user input and generated text. Other parameters are explained in more detail in the [README](examples/main/README.md) for the `main` example program.
@@ -635,6 +641,8 @@ OpenLLaMA is an openly licensed reproduction of Meta's original LLaMA model. It
 
 ### Using [GPT4All](https://github.com/nomic-ai/gpt4all)
 
+*Note: these instructions are likely obsoleted by the GGUF update*
+
 - Obtain the `tokenizer.model` file from LLaMA model and put it to `models`
 - Obtain the `added_tokens.json` file from Alpaca model and put it to `models`
 - Obtain the `gpt4all-lora-quantized.bin` file from GPT4All model and put it to `models/gpt4all-7B`
@@ -710,7 +718,7 @@ If your issue is with model generation quality, then please at least scan the fo
 #### How to run
 
 1. Download/extract: https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-raw-v1.zip?ref=salesforce-research
-2. Run `./perplexity -m models/7B/ggml-model-q4_0.bin -f wiki.test.raw`
+2. Run `./perplexity -m models/7B/ggml-model-q4_0.gguf -f wiki.test.raw`
 3. Output:
 ```
 perplexity : calculating perplexity over 655 chunks
@@ -809,13 +817,13 @@ docker run -v /path/to/models:/models ghcr.io/ggerganov/llama.cpp:full --all-in-
 On completion, you are ready to play!
 
 ```bash
-docker run -v /path/to/models:/models ghcr.io/ggerganov/llama.cpp:full --run -m /models/7B/ggml-model-q4_0.bin -p "Building a website can be done in 10 simple steps:" -n 512
+docker run -v /path/to/models:/models ghcr.io/ggerganov/llama.cpp:full --run -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512
 ```
 
 or with a light image:
 
 ```bash
-docker run -v /path/to/models:/models ghcr.io/ggerganov/llama.cpp:light -m /models/7B/ggml-model-q4_0.bin -p "Building a website can be done in 10 simple steps:" -n 512
+docker run -v /path/to/models:/models ghcr.io/ggerganov/llama.cpp:light -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512
 ```
 
 ### Docker With CUDA
@@ -846,8 +854,8 @@ The resulting images, are essentially the same as the non-CUDA images:
 After building locally, Usage is similar to the non-CUDA examples, but you'll need to add the `--gpus` flag. You will also want to use the `--n-gpu-layers` flag.
 
 ```bash
-docker run --gpus all -v /path/to/models:/models local/llama.cpp:full-cuda --run -m /models/7B/ggml-model-q4_0.bin -p "Building a website can be done in 10 simple steps:" -n 512 --n-gpu-layers 1
-docker run --gpus all -v /path/to/models:/models local/llama.cpp:light-cuda -m /models/7B/ggml-model-q4_0.bin -p "Building a website can be done in 10 simple steps:" -n 512 --n-gpu-layers 1
+docker run --gpus all -v /path/to/models:/models local/llama.cpp:full-cuda --run -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512 --n-gpu-layers 1
+docker run --gpus all -v /path/to/models:/models local/llama.cpp:light-cuda -m /models/7B/ggml-model-q4_0.gguf -p "Building a website can be done in 10 simple steps:" -n 512 --n-gpu-layers 1
 ```
 
 ### Contributing
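
The new "Hot topics" section warns that existing models must be re-converted and re-quantized for GGUF. A minimal sketch of that workflow, reusing the commands shown elsewhere in the README (the 7B paths are illustrative):

```bash
# convert the original weights to an f16 GGUF file
python3 convert.py models/7B/

# re-quantize to 4 bits with the q4_0 method
./quantize ./models/7B/ggml-model-f16.gguf ./models/7B/ggml-model-q4_0.gguf q4_0

# run inference against the re-quantized GGUF model
./main -m ./models/7B/ggml-model-q4_0.gguf -n 128
```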

ci/run.sh

Lines changed: 22 additions & 22 deletions
@@ -159,17 +159,17 @@ function gg_run_open_llama_3b_v2 {
 
     python3 ../convert.py ${path_models}
 
-    model_f16="${path_models}/ggml-model-f16.bin"
-    model_q8_0="${path_models}/ggml-model-q8_0.bin"
-    model_q4_0="${path_models}/ggml-model-q4_0.bin"
-    model_q4_1="${path_models}/ggml-model-q4_1.bin"
-    model_q5_0="${path_models}/ggml-model-q5_0.bin"
-    model_q5_1="${path_models}/ggml-model-q5_1.bin"
-    model_q2_k="${path_models}/ggml-model-q2_k.bin"
-    model_q3_k="${path_models}/ggml-model-q3_k.bin"
-    model_q4_k="${path_models}/ggml-model-q4_k.bin"
-    model_q5_k="${path_models}/ggml-model-q5_k.bin"
-    model_q6_k="${path_models}/ggml-model-q6_k.bin"
+    model_f16="${path_models}/ggml-model-f16.gguf"
+    model_q8_0="${path_models}/ggml-model-q8_0.gguf"
+    model_q4_0="${path_models}/ggml-model-q4_0.gguf"
+    model_q4_1="${path_models}/ggml-model-q4_1.gguf"
+    model_q5_0="${path_models}/ggml-model-q5_0.gguf"
+    model_q5_1="${path_models}/ggml-model-q5_1.gguf"
+    model_q2_k="${path_models}/ggml-model-q2_k.gguf"
+    model_q3_k="${path_models}/ggml-model-q3_k.gguf"
+    model_q4_k="${path_models}/ggml-model-q4_k.gguf"
+    model_q5_k="${path_models}/ggml-model-q5_k.gguf"
+    model_q6_k="${path_models}/ggml-model-q6_k.gguf"
 
     wiki_test_60="${path_wiki}/wiki.test-60.raw"
 
@@ -285,17 +285,17 @@ function gg_run_open_llama_7b_v2 {
 
     python3 ../convert.py ${path_models}
 
-    model_f16="${path_models}/ggml-model-f16.bin"
-    model_q8_0="${path_models}/ggml-model-q8_0.bin"
-    model_q4_0="${path_models}/ggml-model-q4_0.bin"
-    model_q4_1="${path_models}/ggml-model-q4_1.bin"
-    model_q5_0="${path_models}/ggml-model-q5_0.bin"
-    model_q5_1="${path_models}/ggml-model-q5_1.bin"
-    model_q2_k="${path_models}/ggml-model-q2_k.bin"
-    model_q3_k="${path_models}/ggml-model-q3_k.bin"
-    model_q4_k="${path_models}/ggml-model-q4_k.bin"
-    model_q5_k="${path_models}/ggml-model-q5_k.bin"
-    model_q6_k="${path_models}/ggml-model-q6_k.bin"
+    model_f16="${path_models}/ggml-model-f16.gguf"
+    model_q8_0="${path_models}/ggml-model-q8_0.gguf"
+    model_q4_0="${path_models}/ggml-model-q4_0.gguf"
+    model_q4_1="${path_models}/ggml-model-q4_1.gguf"
+    model_q5_0="${path_models}/ggml-model-q5_0.gguf"
+    model_q5_1="${path_models}/ggml-model-q5_1.gguf"
+    model_q2_k="${path_models}/ggml-model-q2_k.gguf"
+    model_q3_k="${path_models}/ggml-model-q3_k.gguf"
+    model_q4_k="${path_models}/ggml-model-q4_k.gguf"
+    model_q5_k="${path_models}/ggml-model-q5_k.gguf"
+    model_q6_k="${path_models}/ggml-model-q6_k.gguf"
 
     wiki_test="${path_wiki}/wiki.test.raw"
 
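
Both CI functions now derive every quantized model path with the `.gguf` suffix after conversion. A sketch of the pattern the script follows (the quantize invocation is an assumption based on the surrounding script, not shown in these hunks):

```bash
# paths produced by convert.py now use the GGUF extension
model_f16="${path_models}/ggml-model-f16.gguf"
model_q4_0="${path_models}/ggml-model-q4_0.gguf"

# each quantized variant is then generated from the f16 GGUF file
./bin/quantize ${model_f16} ${model_q4_0} q4_0
```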

common/CMakeLists.txt

Lines changed: 20 additions & 0 deletions
@@ -0,0 +1,20 @@
+# common
+
+set(TARGET common)
+
+add_library(${TARGET} OBJECT
+    common.h
+    common.cpp
+    console.h
+    console.cpp
+    grammar-parser.h
+    grammar-parser.cpp
+    )
+
+if (BUILD_SHARED_LIBS)
+    set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON)
+endif()
+
+target_include_directories(${TARGET} PUBLIC .)
+target_compile_features(${TARGET} PUBLIC cxx_std_11)
+target_link_libraries(${TARGET} PRIVATE llama)
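
Because `common` is declared as an OBJECT library, the shared helper code (argument parsing, console handling, grammar parser) is compiled once and reused by the examples; it can also be built on its own. A minimal sketch, assuming an out-of-source build directory named `build`:

```bash
cmake -B build
cmake --build build --target common
```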
