
Commit e59b546

Merge branch 'master' into feat-jina-embeddings

2 parents cf9fcd8 + 911b390 · commit e59b546

40 files changed: +1650 −227 lines

CMakeLists.txt

Lines changed: 10 additions & 1 deletion

@@ -103,6 +103,8 @@ set(LLAMA_CUDA_KQUANTS_ITER "2" CACHE STRING "llama: iters./thread per block for
 set(LLAMA_CUDA_PEER_MAX_BATCH_SIZE "128" CACHE STRING
     "llama: max. batch size for using peer access")
 option(LLAMA_CUDA_NO_PEER_COPY "llama: do not use peer to peer copies" OFF)
+option(LLAMA_CUDA_NO_VMM "llama: do not try to use CUDA VMM" OFF)
+
 option(LLAMA_CURL "llama: use libcurl to download model from an URL" OFF)
 option(LLAMA_HIPBLAS "llama: use hipBLAS" OFF)
 option(LLAMA_HIP_UMA "llama: use HIP unified memory architecture" OFF)

@@ -409,6 +411,9 @@ if (LLAMA_CUDA)
     if (LLAMA_CUDA_FORCE_MMQ)
         add_compile_definitions(GGML_CUDA_FORCE_MMQ)
     endif()
+    if (LLAMA_CUDA_NO_VMM)
+        add_compile_definitions(GGML_CUDA_NO_VMM)
+    endif()
     add_compile_definitions(GGML_CUDA_DMMV_X=${LLAMA_CUDA_DMMV_X})
     add_compile_definitions(GGML_CUDA_MMV_Y=${LLAMA_CUDA_MMV_Y})
     if (DEFINED LLAMA_CUDA_DMMV_Y)

@@ -434,7 +439,11 @@ if (LLAMA_CUDA)
         set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cudart CUDA::cublas CUDA::cublasLt)
     endif()

-    set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cuda_driver)
+    if (LLAMA_CUDA_NO_VMM)
+        # No VMM requested, no need to link directly with the cuda driver lib (libcuda.so)
+    else()
+        set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cuda_driver) # required by cuDeviceGetAttribute(), cuMemGetAllocationGranularity(...), ...
+    endif()

     if (NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
         # 52 == lowest CUDA 12 standard
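The new flag is a configure-time toggle. As a rough sketch (paths, generator, and the remaining options depend on your setup), a CUDA build with VMM disabled might be configured like this:

```bash
# Hypothetical configure/build; -DLLAMA_CUDA_NO_VMM=ON defines GGML_CUDA_NO_VMM
# and skips linking CUDA::cuda_driver, per the diff above
mkdir -p build && cd build
cmake .. -DLLAMA_CUDA=ON -DLLAMA_CUDA_NO_VMM=ON
cmake --build . --config Release -j
```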

README.md

Lines changed: 20 additions & 24 deletions

@@ -20,7 +20,8 @@ Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others)

 ### Hot topics

-- **BPE pre-tokenization support has been added: https://github.com/ggerganov/llama.cpp/pull/6920**
+- **Initial Flash-Attention support: https://github.com/ggerganov/llama.cpp/pull/5021**
+- BPE pre-tokenization support has been added: https://github.com/ggerganov/llama.cpp/pull/6920
 - MoE memory layout has been updated - reconvert models for `mmap` support and regenerate `imatrix` https://github.com/ggerganov/llama.cpp/pull/6387
 - Model sharding instructions using `gguf-split` https://github.com/ggerganov/llama.cpp/discussions/6404
 - Fix major bug in Metal batched inference https://github.com/ggerganov/llama.cpp/pull/6225

@@ -935,25 +936,35 @@ If your issue is with model generation quality, then please at least scan the fo

 ### Android

-#### Building the Project using Android NDK
-You can easily run `llama.cpp` on Android device with [termux](https://termux.dev/).
+#### Build on Android using Termux
+[Termux](https://github.com/termux/termux-app#installation) is a method to execute `llama.cpp` on an Android device (no root required).
+```
+apt update && apt upgrade -y
+apt install git make cmake
+```

-First, install the essential packages for termux:
+It's recommended to move your model inside the `~/` directory for best performance:
 ```
-pkg install clang wget git cmake
+cd storage/downloads
+mv model.gguf ~/
 ```
-Second, obtain the [Android NDK](https://developer.android.com/ndk) and then build with CMake:

-You can execute the following commands on your computer to avoid downloading the NDK to your mobile. Of course, you can also do this in Termux.
+[Get the code](https://github.com/ggerganov/llama.cpp#get-the-code) & [follow the Linux build instructions](https://github.com/ggerganov/llama.cpp#build) to build `llama.cpp`.
+
+#### Building the Project using Android NDK
+Obtain the [Android NDK](https://developer.android.com/ndk) and then build with CMake.

+Execute the following commands on your computer to avoid downloading the NDK to your mobile. Alternatively, you can also do this in Termux:
 ```
 $ mkdir build-android
 $ cd build-android
 $ export NDK=<your_ndk_directory>
 $ cmake -DCMAKE_TOOLCHAIN_FILE=$NDK/build/cmake/android.toolchain.cmake -DANDROID_ABI=arm64-v8a -DANDROID_PLATFORM=android-23 -DCMAKE_C_FLAGS=-march=armv8.4a+dotprod ..
 $ make
 ```
-Install [termux](https://termux.dev/) on your device and run `termux-setup-storage` to get access to your SD card.
+
+Install [termux](https://github.com/termux/termux-app#installation) on your device and run `termux-setup-storage` to get access to your SD card (if Android 11+ then run the command twice).
+
 Finally, copy these built `llama` binaries and the model file to your device storage. Because the file permissions in the Android sdcard cannot be changed, you can copy the executable files to the `/data/data/com.termux/files/home/bin` path, and then execute the following commands in Termux to add executable permission:

 (Assumed that you have pushed the built executable files to the /sdcard/llama.cpp/bin path using `adb push`)

@@ -975,25 +986,10 @@ $cd /data/data/com.termux/files/home/bin
 $./main -m ../model/llama-2-7b-chat.Q4_K_M.gguf -n 128 -cml
 ```

-Here is a demo of an interactive session running on Pixel 5 phone:
+Here's a demo of an interactive session running on Pixel 5 phone:

 https://user-images.githubusercontent.com/271616/225014776-1d567049-ad71-4ef2-b050-55b0b3b9274c.mp4

-#### Build on Android using Termux
-[Termux](https://github.com/termux/termux-app#installation) is an alternative to execute `llama.cpp` on an Android device (no root required).
-```
-apt update && apt upgrade -y
-apt install git
-```
-
-It's recommended to move your model inside the `~/` directory for best performance:
-```
-cd storage/downloads
-mv model.gguf ~/
-```
-
-[Follow the Linux build instructions](https://github.com/ggerganov/llama.cpp#build) to build `llama.cpp`.
-
 ### Docker

 #### Prerequisites
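The copy step described in the Android NDK section above is usually done over `adb`; here is a hedged sketch with placeholder paths and binary names (the exact output directory depends on how you ran CMake):

```bash
# Push the cross-compiled binary and a model to the locations the README assumes,
# then finish the copy/chmod steps inside Termux as described in the section above
adb push build-android/bin/main /sdcard/llama.cpp/bin/
adb push llama-2-7b-chat.Q4_K_M.gguf /sdcard/llama.cpp/model/
```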

ci/run.sh

Lines changed: 6 additions & 5 deletions

@@ -160,9 +160,8 @@ function gg_run_test_scripts_debug {

     set -e

-    # TODO: too slow, run on dedicated node
-    #(cd ./examples/gguf-split && time bash tests.sh "$SRC/build-ci-debug/bin" "$MNT/models") 2>&1 | tee -a $OUT/${ci}-scripts.log
-    #(cd ./examples/quantize && time bash tests.sh "$SRC/build-ci-debug/bin" "$MNT/models") 2>&1 | tee -a $OUT/${ci}-scripts.log
+    (cd ./examples/gguf-split && time bash tests.sh "$SRC/build-ci-debug/bin" "$MNT/models") 2>&1 | tee -a $OUT/${ci}-scripts.log
+    (cd ./examples/quantize && time bash tests.sh "$SRC/build-ci-debug/bin" "$MNT/models") 2>&1 | tee -a $OUT/${ci}-scripts.log

     set +e
 }

@@ -695,8 +694,10 @@ test $ret -eq 0 && gg_run ctest_release
 if [ -z ${GG_BUILD_LOW_PERF} ]; then
     test $ret -eq 0 && gg_run embd_bge_small

-    test $ret -eq 0 && gg_run test_scripts_debug
-    test $ret -eq 0 && gg_run test_scripts_release
+    if [ -z ${GG_BUILD_CLOUD} ] || [ ${GG_BUILD_EXTRA_TESTS_0} ]; then
+        test $ret -eq 0 && gg_run test_scripts_debug
+        test $ret -eq 0 && gg_run test_scripts_release
+    fi

     if [ -z ${GG_BUILD_VRAM_GB} ] || [ ${GG_BUILD_VRAM_GB} -ge 8 ]; then
         if [ -z ${GG_BUILD_CUDA} ]; then
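This re-enables the `gguf-split`/`quantize` script tests but keeps them off cloud runners unless explicitly requested. A hedged sketch of exercising the new gate (the two positional arguments follow the usual `ci/run.sh <output-dir> <mnt-dir>` convention; adjust to your environment):

```bash
# Local run: GG_BUILD_CLOUD is unset, so the script tests run as before
bash ./ci/run.sh ./tmp/results ./tmp/mnt

# Cloud-style run that still opts in to the extra script tests
GG_BUILD_CLOUD=1 GG_BUILD_EXTRA_TESTS_0=1 bash ./ci/run.sh ./tmp/results ./tmp/mnt
```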

common/sampling.cpp

Lines changed: 5 additions & 0 deletions

@@ -35,6 +35,8 @@ struct llama_sampling_context * llama_sampling_init(const struct llama_sampling_

     result->prev.resize(params.n_prev);

+    result->n_considered = 0;
+
     llama_sampling_set_rng_seed(result, params.seed);

     return result;

@@ -64,6 +66,7 @@ void llama_sampling_reset(llama_sampling_context * ctx) {

     std::fill(ctx->prev.begin(), ctx->prev.end(), 0);
     ctx->cur.clear();
+    ctx->n_considered = 0;
 }

 void llama_sampling_set_rng_seed(struct llama_sampling_context * ctx, uint32_t seed) {

@@ -253,6 +256,8 @@ static llama_token llama_sampling_sample_impl(
         }
     }

+    ctx_sampling->n_considered = cur_p.size;
+
     return id;
 }

common/sampling.h

Lines changed: 1 addition & 0 deletions

@@ -81,6 +81,7 @@ struct llama_sampling_context {
     // TODO: replace with ring-buffer
     std::vector<llama_token> prev;
     std::vector<llama_token_data> cur;
+    size_t n_considered;

     std::mt19937 rng;
 };

convert-hf-to-gguf-update.py

Lines changed: 5 additions & 0 deletions

@@ -67,6 +67,9 @@ class TOKENIZER_TYPE(IntEnum):
     {"name": "gpt-2",     "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/openai-community/gpt2", },
     {"name": "refact",    "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/smallcloudai/Refact-1_6-base", },
     {"name": "command-r", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/CohereForAI/c4ai-command-r-v01", },
+    {"name": "qwen2",     "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/Qwen/Qwen1.5-7B", },
+    {"name": "olmo",      "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/allenai/OLMo-1.7-7B-hf", },
+    {"name": "dbrx",      "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/databricks/dbrx-base", },
     {"name": "jina-embeddings-v2-base-es", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-es", },
     {"name": "jina-embeddings-v2-base-de", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-de", },
 ]

@@ -152,6 +155,8 @@ def download_file_with_auth(url, token, save_path):
     # print the "pre_tokenizer" content from the tokenizer.json
     with open(f"models/tokenizers/{name}/tokenizer.json", "r", encoding="utf-8") as f:
         cfg = json.load(f)
+        normalizer = cfg["normalizer"]
+        logger.info("normalizer: " + json.dumps(normalizer, indent=4))
         pre_tokenizer = cfg["pre_tokenizer"]
         logger.info("pre_tokenizer: " + json.dumps(pre_tokenizer, indent=4))
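After new tokenizer entries are added here, the update script is typically re-run so it can download each tokenizer and regenerate the `chkhsh` checks consumed by `convert-hf-to-gguf.py`. A hedged sketch of the invocation (the script expects a Hugging Face read token as its argument; `<hf_token>` is a placeholder):

```bash
# Re-download the tokenizers (incl. qwen2, olmo, dbrx) and regenerate get_vocab_base_pre()
python3 convert-hf-to-gguf-update.py <hf_token>
```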

convert-hf-to-gguf.py

Lines changed: 12 additions & 2 deletions

@@ -315,6 +315,15 @@ def get_vocab_base_pre(self, tokenizer) -> str:
         if chkhsh == "9c2227e4dd922002fb81bde4fc02b0483ca4f12911410dee2255e4987644e3f8":
             # ref: https://huggingface.co/CohereForAI/c4ai-command-r-v01
             res = "command-r"
+        if chkhsh == "e636dc30a262dcc0d8c323492e32ae2b70728f4df7dfe9737d9f920a282b8aea":
+            # ref: https://huggingface.co/Qwen/Qwen1.5-7B
+            res = "qwen2"
+        if chkhsh == "b6dc8df998e1cfbdc4eac8243701a65afe638679230920b50d6f17d81c098166":
+            # ref: https://huggingface.co/allenai/OLMo-1.7-7B-hf
+            res = "olmo"
+        if chkhsh == "a8594e3edff7c29c003940395316294b2c623e09894deebbc65f33f1515df79e":
+            # ref: https://huggingface.co/databricks/dbrx-instruct
+            res = "dbrx"
         if chkhsh == "171aeeedd6fb548d418a7461d053f11b6f1f1fc9b387bd66640d28a4b9f5c643":
             res = "default"
         if chkhsh == "27949a2493fc4a9f53f5b9b029c82689cfbe5d3a1929bb25e043089e28466de6":

@@ -2837,8 +2846,9 @@ class OlmoModel(Model):
     def set_gguf_parameters(self):
         super().set_gguf_parameters()
         self.gguf_writer.add_layer_norm_eps(1e-5)
-        if "clip_qkv" in self.hparams is not None:
-            self.gguf_writer.add_clamp_kqv(self.hparams["clip_qkv"])
+        clip_qkv = self.hparams.get("clip_qkv")
+        if clip_qkv is not None:
+            self.gguf_writer.add_clamp_kqv(clip_qkv)

     # Same as super class, but permuting q_proj, k_proj
     # Copied from: LlamaModel
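With the new hashes in place, checkpoints that use these tokenizers should be recognized during conversion. A hedged example with placeholder paths (flags shown are the commonly used ones; adjust to your setup):

```bash
# Convert a local Qwen1.5 checkout; its BPE pre-tokenizer now resolves to "qwen2"
python3 convert-hf-to-gguf.py ./Qwen1.5-7B --outtype f16 --outfile qwen1.5-7b-f16.gguf
```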

convert.py

Lines changed: 32 additions & 19 deletions

@@ -1508,25 +1508,27 @@ def main(args_in: list[str] | None = None) -> None:
     if args.big_endian:
         endianess = gguf.GGUFEndian.BIG

-    params = Params.load(model_plus)
-    if params.n_ctx == -1:
-        if args.ctx is None:
-            msg = """\
-                The model doesn't have a context size, and you didn't specify one with --ctx
-                Please specify one with --ctx:
-                 - LLaMA v1: --ctx 2048
-                 - LLaMA v2: --ctx 4096"""
-            parser.error(textwrap.dedent(msg))
-        params.n_ctx = args.ctx
-
-    if args.outtype:
-        params.ftype = {
-            "f32": GGMLFileType.AllF32,
-            "f16": GGMLFileType.MostlyF16,
-            "q8_0": GGMLFileType.MostlyQ8_0,
-        }[args.outtype]
-
-    logger.info(f"params = {params}")
+    params = None
+    if args.pad_vocab or not args.vocab_only:
+        params = Params.load(model_plus)
+        if params.n_ctx == -1:
+            if args.ctx is None:
+                msg = """\
+                    The model doesn't have a context size, and you didn't specify one with --ctx
+                    Please specify one with --ctx:
+                     - LLaMA v1: --ctx 2048
+                     - LLaMA v2: --ctx 4096"""
+                parser.error(textwrap.dedent(msg))
+            params.n_ctx = args.ctx
+
+        if args.outtype:
+            params.ftype = {
+                "f32": GGMLFileType.AllF32,
+                "f16": GGMLFileType.MostlyF16,
+                "q8_0": GGMLFileType.MostlyQ8_0,
+            }[args.outtype]
+
+        logger.info(f"params = {params}")

     model_parent_path = model_plus.paths[0].parent
     vocab_path = Path(args.vocab_dir or args.model or model_parent_path)

@@ -1539,6 +1541,17 @@ def main(args_in: list[str] | None = None) -> None:
         if not args.outfile:
             raise ValueError("need --outfile if using --vocab-only")
         outfile = args.outfile
+        if params is None:
+            params = Params(
+                n_vocab    = vocab.vocab_size,
+                n_embd     = 1,
+                n_layer    = 1,
+                n_ctx      = 1,
+                n_ff       = 1,
+                n_head     = 1,
+                n_head_kv  = 1,
+                f_norm_eps = 1e-5,
+            )
         OutputFile.write_vocab_only(outfile, params, vocab, special_vocab,
                                     endianess=endianess, pad_vocab=args.pad_vocab)
         logger.info(f"Wrote {outfile}")

docs/BLIS.md

Lines changed: 1 addition & 1 deletion

@@ -23,7 +23,7 @@ Install BLIS:
 sudo make install
 ```

-We recommend using openmp since it's easier to modify the cores been used.
+We recommend using openmp since it's easier to modify the cores being used.

 ### llama.cpp compilation
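As an illustration of the OpenMP point: the number of cores used can be changed per run through the standard OpenMP environment variable, e.g. (a sketch; binary name and flags depend on your build):

```bash
# Limit the BLIS/OpenMP threads to 8 for this invocation
OMP_NUM_THREADS=8 ./main -m ./models/llama-2-7b.Q4_0.gguf -p "Hello" -n 64
```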

docs/HOWTO-add-model.md

Lines changed: 2 additions & 2 deletions

@@ -96,9 +96,9 @@ NOTE: The dimensions in `ggml` are typically in the reverse order of the `pytorc

 This is the funniest part, you have to provide the inference graph implementation of the new model architecture in `llama_build_graph`.

-Have a look to existing implementation like `build_llama`, `build_dbrx` or `build_bert`.
+Have a look at existing implementation like `build_llama`, `build_dbrx` or `build_bert`.

-When implementing a new graph, please note that the underlying `ggml` backends might not support them all, support of missing backend operations can be added in another PR.
+When implementing a new graph, please note that the underlying `ggml` backends might not support them all, support for missing backend operations can be added in another PR.

 Note: to debug the inference graph: you can use [eval-callback](../examples/eval-callback).
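For the debugging note above, a hedged sketch of running the `eval-callback` example to dump intermediate tensors while bringing up a new graph (exact flags may differ; see that example's README):

```bash
# Prints tensor data for each graph node as the prompt is evaluated
./eval-callback -m ./models/new-arch-f16.gguf -p "hello world" -n 8
```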

examples/finetune/finetune.cpp

Lines changed: 1 addition & 1 deletion

@@ -575,7 +575,7 @@ static struct ggml_tensor * llama_build_lora_finetune_graphs(
     GGML_ASSERT(tokens_input->type == GGML_TYPE_I32);

     auto add_to_f32 = [] (struct ggml_context * ctx, struct ggml_tensor * a, struct ggml_tensor * b) {
-        if (ggml_is_quantized(a->type) || a->type == GGML_TYPE_F16) {
+        if (ggml_is_quantized(a->type) || a->type == GGML_TYPE_F16 || a->type == GGML_TYPE_BF16) {
            return ggml_add_cast(ctx, a, b, GGML_TYPE_F32);
        } else if (a->type == GGML_TYPE_F32) {
            return ggml_add(ctx, a, b);
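This extends the LoRA finetune path to BF16 base weights (previously only quantized and F16 tensors were routed through `ggml_add_cast` for F32 accumulation). A hedged sketch of exercising it with a BF16 base model; file names are placeholders and the full flag set is documented in the finetune example's README:

```bash
# Hypothetical run against a bf16 base model; gradients are cast/accumulated in F32
./finetune \
  --model-base ./models/open-llama-3b-v2-bf16.gguf \
  --train-data ./shakespeare.txt \
  --lora-out   ./lora-shakespeare.bin \
  --ctx 64 --batch 4 --adam-iter 30
```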
