
Commit fc0007e: Merge branch 'master' into add-stablelm-hash
2 parents: bc924e0 + cbf7589

70 files changed (+52547, -42043 lines)

.github/workflows/build.yml

Lines changed: 13 additions & 2 deletions

@@ -898,9 +898,9 @@ jobs:
         shell: bash

     env:
-      WINDOWS_BASEKIT_URL: https://registrationcenter-download.intel.com/akdlm/IRC_NAS/62641e01-1e8d-4ace-91d6-ae03f7f8a71f/w_BaseKit_p_2024.0.0.49563_offline.exe
+      WINDOWS_BASEKIT_URL: https://registrationcenter-download.intel.com/akdlm/IRC_NAS/7dff44ba-e3af-4448-841c-0d616c8da6e7/w_BaseKit_p_2024.1.0.595_offline.exe
       WINDOWS_DPCPP_MKL: intel.oneapi.win.cpp-dpcpp-common:intel.oneapi.win.mkl.devel
-
+      ONEAPI_ROOT: "C:/Program Files (x86)/Intel/oneAPI"
     steps:
       - name: Clone
         id: checkout
@@ -932,6 +932,17 @@ jobs:
         id: pack_artifacts
         if: ${{ ( github.event_name == 'push' && github.ref == 'refs/heads/master' ) || github.event.inputs.create_release == 'true' }}
         run: |
+          echo "cp oneAPI running time dll files in ${{ env.ONEAPI_ROOT }} to ./build/bin"
+          cp "${{ env.ONEAPI_ROOT }}/mkl/latest/bin/mkl_sycl_blas.4.dll" ./build/bin
+          cp "${{ env.ONEAPI_ROOT }}/mkl/latest/bin/mkl_core.2.dll" ./build/bin
+          cp "${{ env.ONEAPI_ROOT }}/mkl/latest/bin/mkl_tbb_thread.2.dll" ./build/bin
+
+          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/pi_win_proxy_loader.dll" ./build/bin
+          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/pi_level_zero.dll" ./build/bin
+          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/sycl7.dll" ./build/bin
+          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/svml_dispmd.dll" ./build/bin
+          cp "${{ env.ONEAPI_ROOT }}/compiler/latest/bin/libmmd.dll" ./build/bin
+          echo "cp oneAPI running time dll files to ./build/bin done"
           7z a llama-${{ steps.tag.outputs.name }}-bin-win-sycl-x64.zip ./build/bin/*

       - name: Upload artifacts

CMakeLists.txt

Lines changed: 2 additions & 13 deletions

@@ -296,7 +296,7 @@ if (LLAMA_BLAS)
         if (LLAMA_STATIC)
             set(BLA_STATIC ON)
         endif()
-        if ($(CMAKE_VERSION) VERSION_GREATER_EQUAL 3.22)
+        if (CMAKE_VERSION VERSION_GREATER_EQUAL 3.22)
            set(BLA_SIZEOF_INTEGER 8)
         endif()

@@ -431,7 +431,7 @@ if (LLAMA_CUDA)

     if (LLAMA_STATIC)
         if (WIN32)
-            # As of 12.3.1 CUDA Tookit for Windows does not offer a static cublas library
+            # As of 12.3.1 CUDA Toolkit for Windows does not offer a static cublas library
             set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cudart_static CUDA::cublas CUDA::cublasLt)
         else ()
             set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cudart_static CUDA::cublas_static CUDA::cublasLt_static)
@@ -1281,17 +1281,6 @@ install(
         WORLD_READ
         WORLD_EXECUTE
     DESTINATION ${CMAKE_INSTALL_BINDIR})
-install(
-    FILES convert-lora-to-ggml.py
-    PERMISSIONS
-        OWNER_READ
-        OWNER_WRITE
-        OWNER_EXECUTE
-        GROUP_READ
-        GROUP_EXECUTE
-        WORLD_READ
-        WORLD_EXECUTE
-    DESTINATION ${CMAKE_INSTALL_BINDIR})
 if (LLAMA_METAL)
     install(
         FILES ggml-metal.metal

README.md

Lines changed: 3 additions & 1 deletion

@@ -2,7 +2,7 @@

 ![llama](https://user-images.githubusercontent.com/1991296/230134379-7181e485-c521-4d23-a0d6-f7b3b61ba524.png)

-[![License: MIT](https://img.shields.io/badge/license-MIT-blue.svg)](https://opensource.org/licenses/MIT)
+[![License: MIT](https://img.shields.io/badge/license-MIT-blue.svg)](https://opensource.org/licenses/MIT) [![Server](https://github.com/ggerganov/llama.cpp/actions/workflows/server.yml/badge.svg?branch=master&event=schedule)](https://github.com/ggerganov/llama.cpp/actions/workflows/server.yml)

 [Roadmap](https://github.com/users/ggerganov/projects/7) / [Project status](https://github.com/ggerganov/llama.cpp/discussions/3471) / [Manifesto](https://github.com/ggerganov/llama.cpp/discussions/205) / [ggml](https://github.com/ggerganov/ggml)

@@ -140,6 +140,7 @@ Typically finetunes of the base models below are supported as well.
 - [x] [MobileVLM 1.7B/3B models](https://huggingface.co/models?search=mobileVLM)
 - [x] [Yi-VL](https://huggingface.co/models?search=Yi-VL)
 - [x] [Mini CPM](https://huggingface.co/models?search=MiniCPM)
+- [x] [Moondream](https://huggingface.co/vikhyatk/moondream2)

 **HTTP server**

@@ -175,6 +176,7 @@ Unless otherwise noted these projects are open-source with permissive licensing:
 - [nat/openplayground](https://github.com/nat/openplayground)
 - [Faraday](https://faraday.dev/) (proprietary)
 - [LMStudio](https://lmstudio.ai/) (proprietary)
+- [Layla](https://play.google.com/store/apps/details?id=com.laylalite) (proprietary)
 - [LocalAI](https://github.com/mudler/LocalAI) (MIT)
 - [LostRuins/koboldcpp](https://github.com/LostRuins/koboldcpp) (AGPL)
 - [Mozilla-Ocho/llamafile](https://github.com/Mozilla-Ocho/llamafile)

ci/run.sh

Lines changed: 0 additions & 95 deletions

@@ -365,47 +365,6 @@ function gg_run_open_llama_3b_v2 {

     cat $OUT/${ci}-imatrix.log | grep "Final" >> $OUT/${ci}-imatrix-sum.log

-    # lora
-    function compare_ppl {
-        qnt="$1"
-        ppl1=$(echo "$2" | grep -oE "[0-9]+\.[0-9]+" | tail -n 1)
-        ppl2=$(echo "$3" | grep -oE "[0-9]+\.[0-9]+" | tail -n 1)
-
-        if [ $(echo "$ppl1 < $ppl2" | bc) -eq 1 ]; then
-            printf ' - %s @ %s (FAIL: %s > %s)\n' "$qnt" "$ppl" "$ppl1" "$ppl2"
-            return 20
-        fi
-
-        printf ' - %s @ %s %s OK\n' "$qnt" "$ppl1" "$ppl2"
-        return 0
-    }
-
-    path_lora="../models-mnt/open-llama/3B-v2/lora"
-    path_shakespeare="../models-mnt/shakespeare"
-
-    shakespeare="${path_shakespeare}/shakespeare.txt"
-    lora_shakespeare="${path_lora}/ggml-adapter-model.bin"
-
-    gg_wget ${path_lora} https://huggingface.co/slaren/open_llama_3b_v2_shakespeare_lora/resolve/main/adapter_config.json
-    gg_wget ${path_lora} https://huggingface.co/slaren/open_llama_3b_v2_shakespeare_lora/resolve/main/adapter_model.bin
-    gg_wget ${path_shakespeare} https://huggingface.co/slaren/open_llama_3b_v2_shakespeare_lora/resolve/main/shakespeare.txt
-
-    python3 ../convert-lora-to-ggml.py ${path_lora}
-
-    # f16
-    (time ./bin/perplexity --model ${model_f16} -f ${shakespeare} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-f16.log
-    (time ./bin/perplexity --model ${model_f16} -f ${shakespeare} --lora ${lora_shakespeare} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-lora-f16.log
-    compare_ppl "f16 shakespeare" "$(cat $OUT/${ci}-ppl-shakespeare-f16.log | grep "^\[1\]")" "$(cat $OUT/${ci}-ppl-shakespeare-lora-f16.log | grep "^\[1\]")" | tee -a $OUT/${ci}-lora-ppl.log
-
-    # q8_0
-    (time ./bin/perplexity --model ${model_q8_0} -f ${shakespeare} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-q8_0.log
-    (time ./bin/perplexity --model ${model_q8_0} -f ${shakespeare} --lora ${lora_shakespeare} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-lora-q8_0.log
-    compare_ppl "q8_0 shakespeare" "$(cat $OUT/${ci}-ppl-shakespeare-q8_0.log | grep "^\[1\]")" "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-lora-ppl.log
-
-    # q8_0 + f16 lora-base
-    (time ./bin/perplexity --model ${model_q8_0} -f ${shakespeare} --lora ${lora_shakespeare} --lora-base ${model_f16} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-lora-q8_0-f16.log
-    compare_ppl "q8_0 / f16 base shakespeare" "$(cat $OUT/${ci}-ppl-shakespeare-q8_0.log | grep "^\[1\]")" "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0-f16.log | grep "^\[1\]")" | tee -a $OUT/${ci}-lora-ppl.log
-
     set +e
 }

@@ -416,7 +375,6 @@ function gg_sum_open_llama_3b_v2 {
     gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
     gg_printf '- perplexity:\n%s\n' "$(cat $OUT/${ci}-ppl.log)"
     gg_printf '- imatrix:\n```\n%s\n```\n' "$(cat $OUT/${ci}-imatrix-sum.log)"
-    gg_printf '- lora:\n%s\n' "$(cat $OUT/${ci}-lora-ppl.log)"
     gg_printf '- f16: \n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-f16.log)"
     gg_printf '- q8_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q8_0.log)"
     gg_printf '- q4_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_0.log)"
@@ -429,11 +387,6 @@ function gg_sum_open_llama_3b_v2 {
     gg_printf '- q5_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_k.log)"
     gg_printf '- q6_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q6_k.log)"
     gg_printf '- save-load-state: \n```\n%s\n```\n' "$(cat $OUT/${ci}-save-load-state.log)"
-    gg_printf '- shakespeare (f16):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-f16.log)"
-    gg_printf '- shakespeare (f16 lora):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-lora-f16.log)"
-    gg_printf '- shakespeare (q8_0):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-q8_0.log)"
-    gg_printf '- shakespeare (q8_0 lora):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0.log)"
-    gg_printf '- shakespeare (q8_0 / f16 base lora):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0-f16.log)"
 }

 # open_llama_7b_v2
@@ -549,48 +502,6 @@ function gg_run_open_llama_7b_v2 {

     cat $OUT/${ci}-imatrix.log | grep "Final" >> $OUT/${ci}-imatrix-sum.log

-    # lora
-    function compare_ppl {
-        qnt="$1"
-        ppl1=$(echo "$2" | grep -oE "[0-9]+\.[0-9]+" | tail -n 1)
-        ppl2=$(echo "$3" | grep -oE "[0-9]+\.[0-9]+" | tail -n 1)
-
-        if [ $(echo "$ppl1 < $ppl2" | bc) -eq 1 ]; then
-            printf ' - %s @ %s (FAIL: %s > %s)\n' "$qnt" "$ppl" "$ppl1" "$ppl2"
-            return 20
-        fi
-
-        printf ' - %s @ %s %s OK\n' "$qnt" "$ppl1" "$ppl2"
-        return 0
-    }
-
-    path_lora="../models-mnt/open-llama/7B-v2/lora"
-    path_shakespeare="../models-mnt/shakespeare"
-
-    shakespeare="${path_shakespeare}/shakespeare.txt"
-    lora_shakespeare="${path_lora}/ggml-adapter-model.bin"
-
-    gg_wget ${path_lora} https://huggingface.co/slaren/open_llama_7b_v2_shakespeare_lora/resolve/main/adapter_config.json
-    gg_wget ${path_lora} https://huggingface.co/slaren/open_llama_7b_v2_shakespeare_lora/resolve/main/adapter_model.bin
-    gg_wget ${path_shakespeare} https://huggingface.co/slaren/open_llama_7b_v2_shakespeare_lora/resolve/main/shakespeare.txt
-
-    python3 ../convert-lora-to-ggml.py ${path_lora}
-
-    # f16
-    (time ./bin/perplexity --model ${model_f16} -f ${shakespeare} -t 1 -ngl 999 -c 2048 -b 512 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-f16.log
-    (time ./bin/perplexity --model ${model_f16} -f ${shakespeare} --lora ${lora_shakespeare} -t 1 -ngl 999 -c 2048 -b 512 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-lora-f16.log
-    compare_ppl "f16 shakespeare" "$(cat $OUT/${ci}-ppl-shakespeare-f16.log | grep "^\[1\]")" "$(cat $OUT/${ci}-ppl-shakespeare-lora-f16.log | grep "^\[1\]")" | tee -a $OUT/${ci}-lora-ppl.log
-
-    # currently not supported by the CUDA backend
-    # q8_0
-    #(time ./bin/perplexity --model ${model_q8_0} -f ${shakespeare} -t 1 -ngl 999 -c 2048 -b 512 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-q8_0.log
-    #(time ./bin/perplexity --model ${model_q8_0} -f ${shakespeare} --lora ${lora_shakespeare} -t 1 -ngl 999 -c 2048 -b 512 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-lora-q8_0.log
-    #compare_ppl "q8_0 shakespeare" "$(cat $OUT/${ci}-ppl-shakespeare-q8_0.log | grep "^\[1\]")" "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0.log | grep "^\[1\]")" | tee -a $OUT/${ci}-lora-ppl.log
-
-    # q8_0 + f16 lora-base
-    #(time ./bin/perplexity --model ${model_q8_0} -f ${shakespeare} --lora ${lora_shakespeare} --lora-base ${model_f16} -t 1 -ngl 999 -c 2048 -b 512 --chunks 3 ) 2>&1 | tee -a $OUT/${ci}-ppl-shakespeare-lora-q8_0-f16.log
-    #compare_ppl "q8_0 / f16 shakespeare" "$(cat $OUT/${ci}-ppl-shakespeare-q8_0.log | grep "^\[1\]")" "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0-f16.log | grep "^\[1\]")" | tee -a $OUT/${ci}-lora-ppl.log
-
     set +e
 }

@@ -601,7 +512,6 @@ function gg_sum_open_llama_7b_v2 {
     gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
     gg_printf '- perplexity:\n%s\n' "$(cat $OUT/${ci}-ppl.log)"
     gg_printf '- imatrix:\n```\n%s\n```\n' "$(cat $OUT/${ci}-imatrix-sum.log)"
-    gg_printf '- lora:\n%s\n' "$(cat $OUT/${ci}-lora-ppl.log)"
     gg_printf '- f16: \n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-f16.log)"
     gg_printf '- q8_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q8_0.log)"
     gg_printf '- q4_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q4_0.log)"
@@ -614,11 +524,6 @@ function gg_sum_open_llama_7b_v2 {
     gg_printf '- q5_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q5_k.log)"
     gg_printf '- q6_k:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q6_k.log)"
     gg_printf '- save-load-state: \n```\n%s\n```\n' "$(cat $OUT/${ci}-save-load-state.log)"
-    gg_printf '- shakespeare (f16):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-f16.log)"
-    gg_printf '- shakespeare (f16 lora):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-lora-f16.log)"
-    #gg_printf '- shakespeare (q8_0):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-q8_0.log)"
-    #gg_printf '- shakespeare (q8_0 lora):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0.log)"
-    #gg_printf '- shakespeare (q8_0 / f16 base lora):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0-f16.log)"
 }

 # bge-small

common/common.cpp

Lines changed: 9 additions & 5 deletions

@@ -901,6 +901,10 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_params & params, int & i, bool & invalid_param) {
         params.interactive = true;
         return true;
     }
+    if (arg == "--interactive-specials") {
+        params.interactive_specials = true;
+        return true;
+    }
     if (arg == "--embedding") {
         params.embedding = true;
         return true;
@@ -1367,14 +1371,12 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
         if (arg.compare(0, arg_prefix.size(), arg_prefix) == 0) {
             std::replace(arg.begin(), arg.end(), '_', '-');
         }
-
         if (!gpt_params_find_arg(argc, argv, arg, params, i, invalid_param)) {
             throw std::invalid_argument("error: unknown argument: " + arg);
         }
-    }
-
-    if (invalid_param) {
-        throw std::invalid_argument("error: invalid parameter for argument: " + arg);
+        if (invalid_param) {
+            throw std::invalid_argument("error: invalid parameter for argument: " + arg);
+        }
     }

     if (params.prompt_cache_all &&
@@ -1422,6 +1424,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     printf("  -h, --help            show this help message and exit\n");
     printf("  --version             show version and build info\n");
     printf("  -i, --interactive     run in interactive mode\n");
+    printf("  --interactive-specials allow special tokens in user text, in interactive mode\n");
     printf("  --interactive-first   run in interactive mode and wait for input right away\n");
     printf("  -cnv, --conversation  run in conversation mode (does not print special tokens and suffix/prefix)\n");
     printf("  -ins, --instruct      run in instruction mode (use with Alpaca models)\n");
@@ -2652,6 +2655,7 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
     dump_string_yaml_multiline(stream, "in_suffix", params.input_prefix.c_str());
     fprintf(stream, "instruct: %s # default: false\n", params.instruct ? "true" : "false");
     fprintf(stream, "interactive: %s # default: false\n", params.interactive ? "true" : "false");
+    fprintf(stream, "interactive_specials: %s # default: false\n", params.interactive_specials ? "true" : "false");
     fprintf(stream, "interactive_first: %s # default: false\n", params.interactive_first ? "true" : "false");
     fprintf(stream, "keep: %d # default: 0\n", params.n_keep);
     fprintf(stream, "logdir: %s # default: unset (no logging)\n", params.logdir.c_str());

common/common.h

Lines changed: 1 addition & 0 deletions

@@ -140,6 +140,7 @@ struct gpt_params {
     bool random_prompt        = false; // do not randomize prompt if none provided
     bool use_color            = false; // use color to distinguish generations and inputs
     bool interactive          = false; // interactive mode
+    bool interactive_specials = false; // whether to allow special tokens from user, during interactive mode
     bool conversation         = false; // conversation mode (does not print special tokens and suffix/prefix)
     bool chatml               = false; // chatml mode (used for models trained on chatml syntax)
     bool prompt_cache_all     = false; // save user input and generations to prompt cache

common/grammar-parser.cpp

Lines changed: 9 additions & 0 deletions

@@ -142,6 +142,9 @@ namespace grammar_parser {
             pos++;
             last_sym_start = out_elements.size();
             while (*pos != '"') {
+                if (!*pos) {
+                    throw std::runtime_error("unexpected end of input");
+                }
                 auto char_pair = parse_char(pos);
                 pos = char_pair.second;
                 out_elements.push_back({LLAMA_GRETYPE_CHAR, char_pair.first});
@@ -156,6 +159,9 @@ namespace grammar_parser {
             }
             last_sym_start = out_elements.size();
             while (*pos != ']') {
+                if (!*pos) {
+                    throw std::runtime_error("unexpected end of input");
+                }
                 auto char_pair = parse_char(pos);
                 pos = char_pair.second;
                 enum llama_gretype type = last_sym_start < out_elements.size()
@@ -164,6 +170,9 @@ namespace grammar_parser {

                 out_elements.push_back({type, char_pair.first});
                 if (pos[0] == '-' && pos[1] != ']') {
+                    if (!pos[1]) {
+                        throw std::runtime_error("unexpected end of input");
+                    }
                     auto endchar_pair = parse_char(pos + 1);
                     pos = endchar_pair.second;
                     out_elements.push_back({LLAMA_GRETYPE_CHAR_RNG_UPPER, endchar_pair.first});
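
The three added checks guard the same failure mode: a grammar string that ends inside a quoted literal, inside a character class, or right after a `-` range separator would otherwise let the scanning loop walk past the terminating NUL. A minimal sketch of the guard, assuming a NUL-terminated C string as input (the `scan_literal` helper below is illustrative, not the actual parser):

```cpp
#include <cstdio>
#include <stdexcept>
#include <string>

// Scan a double-quoted literal starting at the opening quote and return its contents.
// Throws instead of reading past the buffer when the closing quote is missing.
static std::string scan_literal(const char * pos) {
    if (*pos != '"') {
        throw std::runtime_error("expected opening quote");
    }
    ++pos;
    std::string out;
    while (*pos != '"') {
        if (!*pos) { // hit the NUL terminator before the closing quote
            throw std::runtime_error("unexpected end of input");
        }
        out.push_back(*pos++);
    }
    return out;
}

int main() {
    std::printf("%s\n", scan_literal("\"hello\"").c_str()); // prints: hello
    try {
        scan_literal("\"unterminated");                      // missing closing quote
    } catch (const std::exception & e) {
        std::printf("caught: %s\n", e.what());               // caught: unexpected end of input
    }
    return 0;
}
```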

common/sampling.cpp

Lines changed: 3 additions & 3 deletions

@@ -35,7 +35,7 @@ struct llama_sampling_context * llama_sampling_init(const struct llama_sampling_params & params) {

     result->prev.resize(params.n_prev);

-    result->n_considered = 0;
+    result->n_valid = 0;

     llama_sampling_set_rng_seed(result, params.seed);

@@ -66,7 +66,7 @@ void llama_sampling_reset(llama_sampling_context * ctx) {

     std::fill(ctx->prev.begin(), ctx->prev.end(), 0);
     ctx->cur.clear();
-    ctx->n_considered = 0;
+    ctx->n_valid = 0;
 }

 void llama_sampling_set_rng_seed(struct llama_sampling_context * ctx, uint32_t seed) {
@@ -256,7 +256,7 @@ static llama_token llama_sampling_sample_impl(
         }
     }

-    ctx_sampling->n_considered = cur_p.size;
+    ctx_sampling->n_valid = temp == 0.0f ? 0 : cur_p.size;

     return id;
 }

common/sampling.h

Lines changed: 1 addition & 1 deletion

@@ -81,7 +81,7 @@ struct llama_sampling_context {
     // TODO: replace with ring-buffer
     std::vector<llama_token>      prev;
     std::vector<llama_token_data> cur;
-    size_t n_considered;
+    size_t n_valid; // Number of correct top tokens with correct probabilities.

     std::mt19937 rng;
 };
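
The rename from `n_considered` to `n_valid` also changes what gets stored: with `temp == 0.0f` the sampler picks greedily and never builds a usable probability distribution over the candidates, so the field is set to 0. A hypothetical caller could use it to decide whether candidate probabilities are worth reporting; the types below are simplified stand-ins, not the real llama.cpp structs:

```cpp
#include <cstdio>
#include <vector>

// Simplified stand-ins for the real llama.cpp sampling types.
struct token_data { int id; float p; };
struct sampling_ctx {
    std::vector<token_data> cur; // candidates from the last sampling call
    size_t n_valid = 0;          // how many of `cur` carry correct probabilities (0 => none)
};

// Print top-token probabilities only when the sampler actually produced them.
static void report_top_tokens(const sampling_ctx & ctx, size_t k) {
    if (ctx.n_valid == 0) {
        std::printf("greedy sampling: no candidate probabilities available\n");
        return;
    }
    const size_t n = k < ctx.n_valid ? k : ctx.n_valid;
    for (size_t i = 0; i < n; ++i) {
        std::printf("token %d: p = %.3f\n", ctx.cur[i].id, ctx.cur[i].p);
    }
}

int main() {
    sampling_ctx greedy;                  // temp == 0 case: n_valid stays 0
    report_top_tokens(greedy, 3);

    sampling_ctx sampled;                 // temp > 0 case: probabilities are meaningful
    sampled.cur = {{42, 0.61f}, {7, 0.22f}, {13, 0.09f}};
    sampled.n_valid = sampled.cur.size();
    report_top_tokens(sampled, 3);
    return 0;
}
```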
