Commit 9566de9

Merge branch 'master' into hkvc_chat_interactivespecials
Merge master as of 20240510IST1236 into this branch. Fix a merge conflict with the newly added conversation flag in the master branch.
2 parents: 76730e1 + d11afd6

91 files changed: +55405 / −42039 lines


CMakeLists.txt

Lines changed: 12 additions & 2 deletions
@@ -103,6 +103,8 @@ set(LLAMA_CUDA_KQUANTS_ITER "2" CACHE STRING "llama: iters./thread per block for
 set(LLAMA_CUDA_PEER_MAX_BATCH_SIZE "128" CACHE STRING
     "llama: max. batch size for using peer access")
 option(LLAMA_CUDA_NO_PEER_COPY "llama: do not use peer to peer copies" OFF)
+option(LLAMA_CUDA_NO_VMM "llama: do not try to use CUDA VMM" OFF)
+
 option(LLAMA_CURL "llama: use libcurl to download model from an URL" OFF)
 option(LLAMA_HIPBLAS "llama: use hipBLAS" OFF)
 option(LLAMA_HIP_UMA "llama: use HIP unified memory architecture" OFF)
@@ -403,12 +405,16 @@ if (LLAMA_CUDA)
     list(APPEND GGML_SOURCES_CUDA "ggml-cuda.cu")
 
     add_compile_definitions(GGML_USE_CUDA)
+    add_compile_definitions(GGML_CUDA_USE_GRAPHS)
     if (LLAMA_CUDA_FORCE_DMMV)
         add_compile_definitions(GGML_CUDA_FORCE_DMMV)
     endif()
     if (LLAMA_CUDA_FORCE_MMQ)
         add_compile_definitions(GGML_CUDA_FORCE_MMQ)
     endif()
+    if (LLAMA_CUDA_NO_VMM)
+        add_compile_definitions(GGML_CUDA_NO_VMM)
+    endif()
     add_compile_definitions(GGML_CUDA_DMMV_X=${LLAMA_CUDA_DMMV_X})
     add_compile_definitions(GGML_CUDA_MMV_Y=${LLAMA_CUDA_MMV_Y})
     if (DEFINED LLAMA_CUDA_DMMV_Y)
@@ -425,7 +431,7 @@ if (LLAMA_CUDA)
 
     if (LLAMA_STATIC)
         if (WIN32)
-            # As of 12.3.1 CUDA Tookit for Windows does not offer a static cublas library
+            # As of 12.3.1 CUDA Toolkit for Windows does not offer a static cublas library
             set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cudart_static CUDA::cublas CUDA::cublasLt)
         else ()
             set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cudart_static CUDA::cublas_static CUDA::cublasLt_static)
@@ -434,7 +440,11 @@ if (LLAMA_CUDA)
         set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cudart CUDA::cublas CUDA::cublasLt)
     endif()
 
-    set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cuda_driver)
+    if (LLAMA_CUDA_NO_VMM)
+        # No VMM requested, no need to link directly with the cuda driver lib (libcuda.so)
+    else()
+        set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cuda_driver) # required by cuDeviceGetAttribute(), cuMemGetAllocationGranularity(...), ...
+    endif()
 
     if (NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
         # 52 == lowest CUDA 12 standard
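For reference, a configure step that exercises the new options might look like the following. This is a minimal sketch, not part of the commit; the build directory, default generator, and Release config are assumptions.

```bash
# minimal sketch: CUDA build with the new VMM opt-out
# (LLAMA_CUDA and LLAMA_CUDA_NO_VMM are the CMake options shown above)
mkdir -p build && cd build
cmake .. -DLLAMA_CUDA=ON -DLLAMA_CUDA_NO_VMM=ON
cmake --build . --config Release
```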

Makefile

Lines changed: 1 addition & 1 deletion
@@ -433,7 +433,7 @@ ifdef LLAMA_CUDA
 else
 	CUDA_PATH ?= /usr/local/cuda
 endif
-	MK_CPPFLAGS += -DGGML_USE_CUDA -I$(CUDA_PATH)/include -I$(CUDA_PATH)/targets/$(UNAME_M)-linux/include
+	MK_CPPFLAGS += -DGGML_USE_CUDA -I$(CUDA_PATH)/include -I$(CUDA_PATH)/targets/$(UNAME_M)-linux/include -DGGML_CUDA_USE_GRAPHS
 	MK_LDFLAGS += -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L$(CUDA_PATH)/lib64 -L/usr/lib64 -L$(CUDA_PATH)/targets/$(UNAME_M)-linux/lib -L/usr/lib/wsl/lib
 	OBJS += ggml-cuda.o
 	OBJS += $(patsubst %.cu,%.o,$(wildcard ggml-cuda/*.cu))
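A corresponding Makefile-based rebuild might look like this; a sketch under the assumption that CUDA lives at the default /usr/local/cuda path used above:

```bash
# rebuild with the CUDA backend so the new -DGGML_CUDA_USE_GRAPHS define is picked up
make clean
make LLAMA_CUDA=1 -j
```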

README.md

Lines changed: 23 additions & 26 deletions
@@ -2,7 +2,7 @@
 
 ![llama](https://user-images.githubusercontent.com/1991296/230134379-7181e485-c521-4d23-a0d6-f7b3b61ba524.png)
 
-[![License: MIT](https://img.shields.io/badge/license-MIT-blue.svg)](https://opensource.org/licenses/MIT)
+[![License: MIT](https://img.shields.io/badge/license-MIT-blue.svg)](https://opensource.org/licenses/MIT) [![Server](https://github.com/ggerganov/llama.cpp/actions/workflows/server.yml/badge.svg?branch=master&event=schedule)](https://github.com/ggerganov/llama.cpp/actions/workflows/server.yml)
 
 [Roadmap](https://github.com/users/ggerganov/projects/7) / [Project status](https://github.com/ggerganov/llama.cpp/discussions/3471) / [Manifesto](https://github.com/ggerganov/llama.cpp/discussions/205) / [ggml](https://github.com/ggerganov/ggml)
 
@@ -20,7 +20,8 @@ Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others)
 
 ### Hot topics
 
-- **BPE pre-tokenization support has been added: https://github.com/ggerganov/llama.cpp/pull/6920**
+- **Initial Flash-Attention support: https://github.com/ggerganov/llama.cpp/pull/5021**
+- BPE pre-tokenization support has been added: https://github.com/ggerganov/llama.cpp/pull/6920
 - MoE memory layout has been updated - reconvert models for `mmap` support and regenerate `imatrix` https://github.com/ggerganov/llama.cpp/pull/6387
 - Model sharding instructions using `gguf-split` https://github.com/ggerganov/llama.cpp/discussions/6404
 - Fix major bug in Metal batched inference https://github.com/ggerganov/llama.cpp/pull/6225
@@ -175,6 +176,7 @@ Unless otherwise noted these projects are open-source with permissive licensing:
 - [nat/openplayground](https://github.com/nat/openplayground)
 - [Faraday](https://faraday.dev/) (proprietary)
 - [LMStudio](https://lmstudio.ai/) (proprietary)
+- [Layla](https://play.google.com/store/apps/details?id=com.laylalite) (proprietary)
 - [LocalAI](https://github.com/mudler/LocalAI) (MIT)
 - [LostRuins/koboldcpp](https://github.com/LostRuins/koboldcpp) (AGPL)
 - [Mozilla-Ocho/llamafile](https://github.com/Mozilla-Ocho/llamafile)
@@ -712,7 +714,7 @@ Building the program with BLAS support may lead to some performance improvements
 
 To obtain the official LLaMA 2 weights please see the <a href="#obtaining-and-using-the-facebook-llama-2-model">Obtaining and using the Facebook LLaMA 2 model</a> section. There is also a large selection of pre-quantized `gguf` models available on Hugging Face.
 
-Note: `convert.py` does not support LLaMA 3, you can use `convert-hf-to-gguf.py` with LLaMA 3 downloaded from Hugging Face.
+Note: `convert.py` does not support LLaMA 3, you can use `convert-hf-to-gguf.py` with LLaMA 3 downloaded from Hugging Face.
 
 ```bash
 # obtain the official LLaMA model weights and place them in ./models
@@ -935,25 +937,35 @@ If your issue is with model generation quality, then please at least scan the fo
 
 ### Android
 
-#### Building the Project using Android NDK
-You can easily run `llama.cpp` on Android device with [termux](https://termux.dev/).
+#### Build on Android using Termux
+[Termux](https://github.com/termux/termux-app#installation) is a method to execute `llama.cpp` on an Android device (no root required).
+```
+apt update && apt upgrade -y
+apt install git make cmake
+```
 
-First, install the essential packages for termux:
+It's recommended to move your model inside the `~/` directory for best performance:
 ```
-pkg install clang wget git cmake
+cd storage/downloads
+mv model.gguf ~/
 ```
-Second, obtain the [Android NDK](https://developer.android.com/ndk) and then build with CMake:
 
-You can execute the following commands on your computer to avoid downloading the NDK to your mobile. Of course, you can also do this in Termux.
+[Get the code](https://github.com/ggerganov/llama.cpp#get-the-code) & [follow the Linux build instructions](https://github.com/ggerganov/llama.cpp#build) to build `llama.cpp`.
+
+#### Building the Project using Android NDK
+Obtain the [Android NDK](https://developer.android.com/ndk) and then build with CMake.
 
+Execute the following commands on your computer to avoid downloading the NDK to your mobile. Alternatively, you can also do this in Termux:
 ```
 $ mkdir build-android
 $ cd build-android
 $ export NDK=<your_ndk_directory>
 $ cmake -DCMAKE_TOOLCHAIN_FILE=$NDK/build/cmake/android.toolchain.cmake -DANDROID_ABI=arm64-v8a -DANDROID_PLATFORM=android-23 -DCMAKE_C_FLAGS=-march=armv8.4a+dotprod ..
 $ make
 ```
-Install [termux](https://termux.dev/) on your device and run `termux-setup-storage` to get access to your SD card.
+
+Install [termux](https://github.com/termux/termux-app#installation) on your device and run `termux-setup-storage` to get access to your SD card (if Android 11+ then run the command twice).
+
 Finally, copy these built `llama` binaries and the model file to your device storage. Because the file permissions in the Android sdcard cannot be changed, you can copy the executable files to the `/data/data/com.termux/files/home/bin` path, and then execute the following commands in Termux to add executable permission:
 
 (Assumed that you have pushed the built executable files to the /sdcard/llama.cpp/bin path using `adb push`)
@@ -975,25 +987,10 @@ $cd /data/data/com.termux/files/home/bin
 $./main -m ../model/llama-2-7b-chat.Q4_K_M.gguf -n 128 -cml
 ```
 
-Here is a demo of an interactive session running on Pixel 5 phone:
+Here's a demo of an interactive session running on Pixel 5 phone:
 
 https://user-images.githubusercontent.com/271616/225014776-1d567049-ad71-4ef2-b050-55b0b3b9274c.mp4
 
-#### Build on Android using Termux
-[Termux](https://github.com/termux/termux-app#installation) is an alternative to execute `llama.cpp` on an Android device (no root required).
-```
-apt update && apt upgrade -y
-apt install git
-```
-
-It's recommended to move your model inside the `~/` directory for best performance:
-```
-cd storage/downloads
-mv model.gguf ~/
-```
-
-[Follow the Linux build instructions](https://github.com/ggerganov/llama.cpp#build) to build `llama.cpp`.
-
 ### Docker
 
 #### Prerequisites
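To go with the "(Assumed that you have pushed the built executable files ...)" step in the README text above, a push from the host might look roughly like this; the binary and model paths are placeholders, not part of the commit:

```bash
# hypothetical paths: adjust to wherever the NDK build placed the binaries
adb push build-android/bin/main /sdcard/llama.cpp/bin/
adb push models/llama-2-7b-chat.Q4_K_M.gguf /sdcard/llama.cpp/model/
```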

ci/run.sh

Lines changed: 6 additions & 5 deletions
@@ -160,9 +160,8 @@ function gg_run_test_scripts_debug {
 
     set -e
 
-    # TODO: too slow, run on dedicated node
-    #(cd ./examples/gguf-split && time bash tests.sh "$SRC/build-ci-debug/bin" "$MNT/models") 2>&1 | tee -a $OUT/${ci}-scripts.log
-    #(cd ./examples/quantize && time bash tests.sh "$SRC/build-ci-debug/bin" "$MNT/models") 2>&1 | tee -a $OUT/${ci}-scripts.log
+    (cd ./examples/gguf-split && time bash tests.sh "$SRC/build-ci-debug/bin" "$MNT/models") 2>&1 | tee -a $OUT/${ci}-scripts.log
+    (cd ./examples/quantize && time bash tests.sh "$SRC/build-ci-debug/bin" "$MNT/models") 2>&1 | tee -a $OUT/${ci}-scripts.log
 
     set +e
 }
@@ -695,8 +694,10 @@ test $ret -eq 0 && gg_run ctest_release
 if [ -z ${GG_BUILD_LOW_PERF} ]; then
     test $ret -eq 0 && gg_run embd_bge_small
 
-    test $ret -eq 0 && gg_run test_scripts_debug
-    test $ret -eq 0 && gg_run test_scripts_release
+    if [ -z ${GG_BUILD_CLOUD} ] || [ ${GG_BUILD_EXTRA_TESTS_0} ]; then
+        test $ret -eq 0 && gg_run test_scripts_debug
+        test $ret -eq 0 && gg_run test_scripts_release
+    fi
 
     if [ -z ${GG_BUILD_VRAM_GB} ] || [ ${GG_BUILD_VRAM_GB} -ge 8 ]; then
         if [ -z ${GG_BUILD_CUDA} ]; then
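With the gate above, the script tests still run by default, but on cloud nodes (GG_BUILD_CLOUD set) they are skipped unless GG_BUILD_EXTRA_TESTS_0 is also set. A sketch of such an invocation; the output and mount paths are assumptions:

```bash
# force the gguf-split/quantize script tests on a cloud CI node
GG_BUILD_CLOUD=1 GG_BUILD_EXTRA_TESTS_0=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
```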

common/common.cpp

Lines changed: 13 additions & 6 deletions
@@ -1,4 +1,6 @@
 #include "common.h"
+// Change JSON_ASSERT from assert() to GGML_ASSERT:
+#define JSON_ASSERT GGML_ASSERT
 #include "json.hpp"
 #include "json-schema-to-grammar.h"
 #include "llama.h"
@@ -915,6 +917,10 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
         params.instruct = true;
         return true;
     }
+    if (arg == "-cnv" || arg == "--conversation") {
+        params.conversation = true;
+        return true;
+    }
     if (arg == "-cml" || arg == "--chatml") {
         params.chatml = true;
         return true;
@@ -1422,6 +1428,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     printf("  -i, --interactive         run in interactive mode\n");
     printf("  --interactive-specials    allow special tokens in user text, in interactive mode\n");
     printf("  --interactive-first       run in interactive mode and wait for input right away\n");
+    printf("  -cnv, --conversation      run in conversation mode (does not print special tokens and suffix/prefix)\n");
     printf("  -ins, --instruct          run in instruction mode (use with Alpaca models)\n");
     printf("  -cml, --chatml            run in chatml mode (use with ChatML-compatible models)\n");
     printf("  --multiline-input         allows you to write or paste multiple lines without ending each in '\\'\n");
@@ -1969,18 +1976,18 @@ static bool llama_download_file(const std::string & url, const std::string & pat
         try {
             metadata_in >> metadata;
             fprintf(stderr, "%s: previous metadata file found %s: %s\n", __func__, metadata_path.c_str(), metadata.dump().c_str());
-            if (metadata.contains("url") && metadata["url"].is_string()) {
-                auto previous_url = metadata["url"].get<std::string>();
+            if (metadata.contains("url") && metadata.at("url").is_string()) {
+                auto previous_url = metadata.at("url").get<std::string>();
                 if (previous_url != url) {
                     fprintf(stderr, "%s: Model URL mismatch: %s != %s\n", __func__, url.c_str(), previous_url.c_str());
                     return false;
                 }
             }
-            if (metadata.contains("etag") && metadata["etag"].is_string()) {
-                etag = metadata["etag"];
+            if (metadata.contains("etag") && metadata.at("etag").is_string()) {
+                etag = metadata.at("etag");
             }
-            if (metadata.contains("lastModified") && metadata["lastModified"].is_string()) {
-                last_modified = metadata["lastModified"];
+            if (metadata.contains("lastModified") && metadata.at("lastModified").is_string()) {
+                last_modified = metadata.at("lastModified");
             }
         } catch (const nlohmann::json::exception & e) {
             fprintf(stderr, "%s: error reading metadata file %s: %s\n", __func__, metadata_path.c_str(), e.what());
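The merged `-cnv`/`--conversation` flag can then be exercised from the command line roughly as follows; a sketch, with the model path as a placeholder:

```bash
# run main in the new conversation mode
./main -m models/your-model.gguf -cnv
```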

common/common.h

Lines changed: 1 addition & 0 deletions
@@ -141,6 +141,7 @@ struct gpt_params {
     bool use_color = false; // use color to distinguish generations and inputs
     bool interactive = false; // interactive mode
     bool interactive_specials = false; // whether to allow special tokens from user, during interactive mode
+    bool conversation = false; // conversation mode (does not print special tokens and suffix/prefix)
     bool chatml = false; // chatml mode (used for models trained on chatml syntax)
     bool prompt_cache_all = false; // save user input and generations to prompt cache
     bool prompt_cache_ro = false; // open the prompt cache read-only and do not update it

common/json-schema-to-grammar.h

Lines changed: 4 additions & 0 deletions
@@ -1,4 +1,8 @@
 #pragma once
+
+#include "ggml.h"
+// Change JSON_ASSERT from assert() to GGML_ASSERT:
+#define JSON_ASSERT GGML_ASSERT
 #include "json.hpp"
 
 std::string json_schema_to_grammar(const nlohmann::ordered_json& schema);

common/sampling.cpp

Lines changed: 5 additions & 0 deletions
@@ -35,6 +35,8 @@ struct llama_sampling_context * llama_sampling_init(const struct llama_sampling_
 
     result->prev.resize(params.n_prev);
 
+    result->n_considered = 0;
+
     llama_sampling_set_rng_seed(result, params.seed);
 
     return result;
@@ -64,6 +66,7 @@ void llama_sampling_reset(llama_sampling_context * ctx) {
 
     std::fill(ctx->prev.begin(), ctx->prev.end(), 0);
     ctx->cur.clear();
+    ctx->n_considered = 0;
 }
 
 void llama_sampling_set_rng_seed(struct llama_sampling_context * ctx, uint32_t seed) {
@@ -253,6 +256,8 @@ static llama_token llama_sampling_sample_impl(
         }
     }
 
+    ctx_sampling->n_considered = cur_p.size;
+
     return id;
 }

common/sampling.h

Lines changed: 1 addition & 0 deletions
@@ -81,6 +81,7 @@ struct llama_sampling_context {
     // TODO: replace with ring-buffer
     std::vector<llama_token> prev;
    std::vector<llama_token_data> cur;
+    size_t n_considered;
 
     std::mt19937 rng;
 };

convert-hf-to-gguf-update.py

Lines changed: 10 additions & 0 deletions
@@ -49,6 +49,10 @@ class TOKENIZER_TYPE(IntEnum):
 
 if len(sys.argv) == 2:
     token = sys.argv[1]
+    if not token.startswith("hf_"):
+        logger.info("Huggingface token seems invalid")
+        logger.info("Usage: python convert-hf-to-gguf-update.py <huggingface_token>")
+        sys.exit(1)
 else:
     logger.info("Usage: python convert-hf-to-gguf-update.py <huggingface_token>")
     sys.exit(1)
@@ -67,6 +71,9 @@ class TOKENIZER_TYPE(IntEnum):
     {"name": "gpt-2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/openai-community/gpt2", },
     {"name": "refact", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/smallcloudai/Refact-1_6-base", },
     {"name": "command-r", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/CohereForAI/c4ai-command-r-v01", },
+    {"name": "qwen2", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/Qwen/Qwen1.5-7B", },
+    {"name": "olmo", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/allenai/OLMo-1.7-7B-hf", },
+    {"name": "dbrx", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/databricks/dbrx-base", },
 ]
 
 # make directory "models/tokenizers" if it doesn't exist
@@ -150,6 +157,8 @@ def download_file_with_auth(url, token, save_path):
 # print the "pre_tokenizer" content from the tokenizer.json
 with open(f"models/tokenizers/{name}/tokenizer.json", "r", encoding="utf-8") as f:
     cfg = json.load(f)
+    normalizer = cfg["normalizer"]
+    logger.info("normalizer: " + json.dumps(normalizer, indent=4))
     pre_tokenizer = cfg["pre_tokenizer"]
     logger.info("pre_tokenizer: " + json.dumps(pre_tokenizer, indent=4))
 
@@ -252,6 +261,7 @@ def get_vocab_base_pre(self, tokenizer) -> str:
         "3333333",
         "33333333",
         "333333333",
+        # "Cửa Việt", # llama-bpe fails on this
         chktxt,
     ]
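Per the usage string in the diff above, the updater is invoked with a Hugging Face token that must start with "hf_"; the value below is a placeholder:

```bash
python convert-hf-to-gguf-update.py hf_XXXXXXXXXXXXXXXXXXXXXXXX
```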
