Commit d735894

Merge remote-tracking branch 'origin/master' into macos_vulkan
2 parents b51b69e + 4524290 commit d735894


66 files changed: +5015 / -2200 lines

.flake8

Lines changed: 1 addition & 0 deletions
@@ -1,2 +1,3 @@
 [flake8]
 max-line-length = 125
+ignore = W503

.github/workflows/python-lint.yml

Lines changed: 1 addition & 1 deletion
@@ -16,5 +16,5 @@ jobs:
     - name: flake8 Lint
       uses: py-actions/flake8@v2
       with:
-        ignore: "E203,E211,E221,E225,E231,E241,E251,E261,E266,E501,E701,E704"
+        ignore: "E203,E211,E221,E225,E231,E241,E251,E261,E266,E501,E701,E704,W503"
         exclude: "examples/*,examples/*/**,*/**/__init__.py"

CMakeLists.txt

Lines changed: 13 additions & 3 deletions
@@ -855,11 +855,21 @@ if (CMAKE_OSX_ARCHITECTURES STREQUAL "arm64" OR CMAKE_GENERATOR_PLATFORM_LWR STR
          CMAKE_SYSTEM_PROCESSOR MATCHES "^(aarch64|arm.*|ARM64)$"))
     message(STATUS "ARM detected")
     if (MSVC)
+        add_compile_definitions(__aarch64__) # MSVC defines _M_ARM64 instead
         add_compile_definitions(__ARM_NEON)
         add_compile_definitions(__ARM_FEATURE_FMA)
-        add_compile_definitions(__ARM_FEATURE_DOTPROD)
-        # add_compile_definitions(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) # MSVC doesn't support vdupq_n_f16, vld1q_f16, vst1q_f16
-        add_compile_definitions(__aarch64__) # MSVC defines _M_ARM64 instead
+
+        set(CMAKE_REQUIRED_FLAGS_PREV ${CMAKE_REQUIRED_FLAGS})
+        string(JOIN " " CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS} "/arch:armv8.2")
+        check_cxx_source_compiles("#include <arm_neon.h>\nint main() { int8x16_t _a, _b; int32x4_t _s = vdotq_s32(_s, _a, _b); return 0; }" GGML_COMPILER_SUPPORT_DOTPROD)
+        if (GGML_COMPILER_SUPPORT_DOTPROD)
+            add_compile_definitions(__ARM_FEATURE_DOTPROD)
+        endif ()
+        check_cxx_source_compiles("#include <arm_neon.h>\nint main() { float16_t _a; float16x8_t _s = vdupq_n_f16(_a); return 0; }" GGML_COMPILER_SUPPORT_FP16_VECTOR_ARITHMETIC)
+        if (GGML_COMPILER_SUPPORT_FP16_VECTOR_ARITHMETIC)
+            add_compile_definitions(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
+        endif ()
+        set(CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS_PREV})
    else()
        check_cxx_compiler_flag(-mfp16-format=ieee COMPILER_SUPPORTS_FP16_FORMAT_I3E)
        if (NOT "${COMPILER_SUPPORTS_FP16_FORMAT_I3E}" STREQUAL "")

Makefile

Lines changed: 8 additions & 0 deletions
@@ -569,6 +569,14 @@ $(info I CC: $(shell $(CC) --version | head -n 1))
 $(info I CXX: $(shell $(CXX) --version | head -n 1))
 ifdef LLAMA_CUBLAS
 $(info I NVCC: $(shell $(NVCC) --version | tail -n 1))
+CUDA_VERSION := $(shell nvcc --version | grep -oP 'release (\K[0-9]+\.[0-9])')
+ifeq ($(shell awk -v "v=$(CUDA_VERSION)" 'BEGIN { print (v < 11.7) }'),1)
+ifndef CUDA_DOCKER_ARCH
+ifndef CUDA_POWER_ARCH
+$(error I ERROR: For CUDA versions < 11.7 a target CUDA architecture must be explicitly provided via CUDA_DOCKER_ARCH)
+endif # CUDA_POWER_ARCH
+endif # CUDA_DOCKER_ARCH
+endif # eq ($(shell echo "$(CUDA_VERSION) < 11.7" | bc),1)
 endif # LLAMA_CUBLAS
 $(info )

Package.swift

Lines changed: 19 additions & 5 deletions
@@ -13,17 +13,31 @@ let package = Package(
     products: [
         .library(name: "llama", targets: ["llama"]),
     ],
-    dependencies: [
-        .package(url: "https://github.com/ggerganov/ggml.git", .branch("release"))
-    ],
     targets: [
         .target(
             name: "llama",
-            dependencies: ["ggml"],
             path: ".",
-            exclude: ["ggml-metal.metal"],
+            exclude: [
+                "cmake",
+                "examples",
+                "scripts",
+                "models",
+                "tests",
+                "CMakeLists.txt",
+                "ggml-cuda.cu",
+                "ggml-cuda.h",
+                "Makefile"
+            ],
             sources: [
+                "ggml.c",
                 "llama.cpp",
+                "ggml-alloc.c",
+                "ggml-backend.c",
+                "ggml-quants.c",
+                "ggml-metal.m",
+            ],
+            resources: [
+                .process("ggml-metal.metal")
             ],
             publicHeadersPath: "spm-headers",
             cSettings: [

README.md

Lines changed: 1 addition & 1 deletion
@@ -958,7 +958,7 @@ We have three Docker images available for this project:

 1. `ghcr.io/ggerganov/llama.cpp:full`: This image includes both the main executable file and the tools to convert LLaMA models into ggml and convert into 4-bit quantization. (platforms: `linux/amd64`, `linux/arm64`)
 2. `ghcr.io/ggerganov/llama.cpp:light`: This image only includes the main executable file. (platforms: `linux/amd64`, `linux/arm64`)
-3. `ghcr.io/ggerganov/llama.cpp:server`: This image only includes the server executabhle file. (platforms: `linux/amd64`, `linux/arm64`)
+3. `ghcr.io/ggerganov/llama.cpp:server`: This image only includes the server executable file. (platforms: `linux/amd64`, `linux/arm64`)

 Additionally, there the following images, similar to the above:

ci/run.sh

Lines changed: 46 additions & 0 deletions
@@ -568,6 +568,50 @@ function gg_sum_open_llama_7b_v2 {
     #gg_printf '- shakespeare (q8_0 / f16 base lora):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0-f16.log)"
 }

+# bge-small
+
+function gg_run_embd_bge_small {
+    cd ${SRC}
+
+    gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/raw/main/config.json
+    gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/resolve/main/tokenizer.model
+    gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/raw/main/tokenizer_config.json
+    gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/raw/main/special_tokens_map.json
+    gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/resolve/main/pytorch_model.bin
+    gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/raw/main/sentence_bert_config.json
+    gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/raw/main/vocab.txt
+
+    path_models="../models-mnt/bge-small"
+
+    rm -rf build-ci-release && mkdir build-ci-release && cd build-ci-release
+
+    set -e
+
+    (time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
+    (time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log
+
+    python3 ../convert-hf-to-gguf.py ${path_models}
+
+    model_f16="${path_models}/ggml-model-f16.gguf"
+    model_q8_0="${path_models}/ggml-model-q8_0.gguf"
+
+    ./bin/quantize ${model_f16} ${model_q8_0} q8_0
+
+    (time ./bin/embedding --model ${model_f16} -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
+    (time ./bin/embedding --model ${model_q8_0} -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
+
+    set +e
+}
+
+function gg_sum_embd_bge_small {
+    gg_printf '### %s\n\n' "${ci}"
+
+    gg_printf 'BGE Small (BERT):\n'
+    gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
+    gg_printf '- f16: \n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-f16.log)"
+    gg_printf '- q8_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q8_0.log)"
+}
+
 ## main

 if [ -z ${GG_BUILD_LOW_PERF} ]; then
@@ -591,6 +635,8 @@ test $ret -eq 0 && gg_run ctest_debug
 test $ret -eq 0 && gg_run ctest_release

 if [ -z ${GG_BUILD_LOW_PERF} ]; then
+    test $ret -eq 0 && gg_run embd_bge_small
+
     if [ -z ${GG_BUILD_VRAM_GB} ] || [ ${GG_BUILD_VRAM_GB} -ge 8 ]; then
         if [ -z ${GG_BUILD_CUDA} ]; then
             test $ret -eq 0 && gg_run open_llama_3b_v2

common/common.cpp

Lines changed: 85 additions & 35 deletions
@@ -340,13 +340,14 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
                 invalid_param = true;
                 break;
             }
-            sparams.samplers_sequence = parse_samplers_input(argv[i]);
+            const auto sampler_names = string_split(argv[i], ';');
+            sparams.samplers_sequence = sampler_types_from_names(sampler_names);
         } else if (arg == "--sampling-seq") {
             if (++i >= argc) {
                 invalid_param = true;
                 break;
             }
-            sparams.samplers_sequence = argv[i];
+            sparams.samplers_sequence = sampler_types_from_chars(argv[i]);
         } else if (arg == "--top-p") {
             if (++i >= argc) {
                 invalid_param = true;
@@ -906,6 +907,14 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) {
 void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     const llama_sampling_params & sparams = params.sparams;

+    std::string sampler_type_chars;
+    std::string sampler_type_names;
+    for (const auto sampler_type : sparams.samplers_sequence) {
+        sampler_type_chars += static_cast<char>(sampler_type);
+        sampler_type_names += sampler_type_to_name_string(sampler_type) + ";";
+    }
+    sampler_type_names.pop_back();
+
     printf("\n");
     printf("usage: %s [options]\n", argv[0]);
     printf("\n");
@@ -947,8 +956,8 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     printf("  -n N, --n-predict N   number of tokens to predict (default: %d, -1 = infinity, -2 = until context filled)\n", params.n_predict);
     printf("  -c N, --ctx-size N    size of the prompt context (default: %d, 0 = loaded from model)\n", params.n_ctx);
     printf("  -b N, --batch-size N  batch size for prompt processing (default: %d)\n", params.n_batch);
-    printf("  --samplers            samplers that will be used for generation in the order, separated by \';\', for example: \"top_k;tfs;typical;top_p;min_p;temp\"\n");
-    printf("  --sampling-seq        simplified sequence for samplers that will be used (default: %s)\n", sparams.samplers_sequence.c_str());
+    printf("  --samplers            samplers that will be used for generation in the order, separated by \';\' (default: %s)\n", sampler_type_names.c_str());
+    printf("  --sampling-seq        simplified sequence for samplers that will be used (default: %s)\n", sampler_type_chars.c_str());
     printf("  --top-k N             top-k sampling (default: %d, 0 = disabled)\n", sparams.top_k);
     printf("  --top-p N             top-p sampling (default: %.1f, 1.0 = disabled)\n", (double)sparams.top_p);
     printf("  --min-p N             min-p sampling (default: %.1f, 0.0 = disabled)\n", (double)sparams.min_p);
@@ -1097,45 +1106,85 @@ std::string gpt_random_prompt(std::mt19937 & rng) {
 }

 //
-// String parsing
+// String utils
 //

-std::string parse_samplers_input(std::string input) {
-    std::string output = "";
+std::vector<std::string> string_split(std::string input, char separator) {
+    std::vector<std::string> parts;
+    size_t separator_pos = input.find(separator);
+    while (separator_pos != std::string::npos) {
+        std::string part = input.substr(0, separator_pos);
+        parts.emplace_back(part);
+        input = input.substr(separator_pos + 1);
+        separator_pos = input.find(separator);
+    }
+    parts.emplace_back(input);
+    return parts;
+}
+
+std::vector<llama_sampler_type> sampler_types_from_names(const std::vector<std::string> & names) {
     // since samplers names are written multiple ways
     // make it ready for both system names and input names
-    std::unordered_map<std::string, char> samplers_symbols {
-        {"top_k",      'k'},
-        {"top-k",      'k'},
-        {"top_p",      'p'},
-        {"top-p",      'p'},
-        {"nucleus",    'p'},
-        {"typical_p",  'y'},
-        {"typical-p",  'y'},
-        {"typical",    'y'},
-        {"min_p",      'm'},
-        {"min-p",      'm'},
-        {"tfs_z",      'f'},
-        {"tfs-z",      'f'},
-        {"tfs",        'f'},
-        {"temp",       't'},
-        {"temperature",'t'}
+    std::unordered_map<std::string, llama_sampler_type> sampler_name_map {
+        {"top_k",       llama_sampler_type::TOP_K},
+        {"top-k",       llama_sampler_type::TOP_K},
+        {"top_p",       llama_sampler_type::TOP_P},
+        {"top-p",       llama_sampler_type::TOP_P},
+        {"nucleus",     llama_sampler_type::TOP_P},
+        {"typical_p",   llama_sampler_type::TYPICAL_P},
+        {"typical-p",   llama_sampler_type::TYPICAL_P},
+        {"typical",     llama_sampler_type::TYPICAL_P},
+        {"min_p",       llama_sampler_type::MIN_P},
+        {"min-p",       llama_sampler_type::MIN_P},
+        {"tfs_z",       llama_sampler_type::TFS_Z},
+        {"tfs-z",       llama_sampler_type::TFS_Z},
+        {"tfs",         llama_sampler_type::TFS_Z},
+        {"temp",        llama_sampler_type::TEMP},
+        {"temperature", llama_sampler_type::TEMP}
+    };
+
+    std::vector<llama_sampler_type> sampler_types;
+    sampler_types.reserve(names.size());
+    for (const auto& name : names) {
+        const auto sampler_item = sampler_name_map.find(name);
+        if (sampler_item != sampler_name_map.end()) {
+            sampler_types.push_back(sampler_item->second);
+        }
+    }
+    return sampler_types;
+}
+
+std::vector<llama_sampler_type> sampler_types_from_chars(const std::string & names_string) {
+    std::unordered_map<char, llama_sampler_type> sampler_name_map {
+        {'k', llama_sampler_type::TOP_K},
+        {'p', llama_sampler_type::TOP_P},
+        {'y', llama_sampler_type::TYPICAL_P},
+        {'m', llama_sampler_type::MIN_P},
+        {'f', llama_sampler_type::TFS_Z},
+        {'t', llama_sampler_type::TEMP}
     };
-    // expected format example: "temp;top_k;tfs_z;typical_p;top_p;min_p"
-    size_t separator = input.find(';');
-    while (separator != input.npos) {
-        std::string name = input.substr(0,separator);
-        input = input.substr(separator+1);
-        separator = input.find(';');
-
-        if (samplers_symbols.find(name) != samplers_symbols.end()) {
-            output += samplers_symbols[name];
+
+    std::vector<llama_sampler_type> sampler_types;
+    sampler_types.reserve(names_string.size());
+    for (const auto & c : names_string) {
+        const auto sampler_item = sampler_name_map.find(c);
+        if (sampler_item != sampler_name_map.end()) {
+            sampler_types.push_back(sampler_item->second);
         }
     }
-    if (samplers_symbols.find(input) != samplers_symbols.end()) {
-        output += samplers_symbols[input];
+    return sampler_types;
+}
+
+std::string sampler_type_to_name_string(llama_sampler_type sampler_type) {
+    switch (sampler_type) {
+        case llama_sampler_type::TOP_K:     return "top_k";
+        case llama_sampler_type::TFS_Z:     return "tfs_z";
+        case llama_sampler_type::TYPICAL_P: return "typical_p";
+        case llama_sampler_type::TOP_P:     return "top_p";
+        case llama_sampler_type::MIN_P:     return "min_p";
+        case llama_sampler_type::TEMP:      return "temp";
        default : return "";
     }
-    return output;
 }

 //
@@ -1550,6 +1599,7 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
     fprintf(stream, "cpu_has_blas: %s\n", ggml_cpu_has_blas() ? "true" : "false");
     fprintf(stream, "cpu_has_sse3: %s\n", ggml_cpu_has_sse3() ? "true" : "false");
     fprintf(stream, "cpu_has_vsx: %s\n", ggml_cpu_has_vsx() ? "true" : "false");
+    fprintf(stream, "cpu_has_matmul_int8: %s\n", ggml_cpu_has_matmul_int8() ? "true" : "false");

 #ifdef NDEBUG
     fprintf(stream, "debug: false\n");

common/common.h

Lines changed: 5 additions & 2 deletions
@@ -162,10 +162,13 @@ std::string gpt_random_prompt(std::mt19937 & rng);
 void process_escapes(std::string& input);

 //
-// String parsing
+// String utils
 //

-std::string parse_samplers_input(std::string input);
+std::vector<llama_sampler_type> sampler_types_from_names(const std::vector<std::string> & names);
+std::vector<llama_sampler_type> sampler_types_from_chars(const std::string & names_string);
+std::vector<std::string> string_split(std::string input, char separator);
+std::string sampler_type_to_name_string(llama_sampler_type sampler_type);

 //
 // Model utils

common/sampling.cpp

Lines changed: 14 additions & 21 deletions
@@ -103,15 +103,10 @@ std::string llama_sampling_print(const llama_sampling_params & params) {
 std::string llama_sampling_order_print(const llama_sampling_params & params) {
     std::string result = "CFG -> Penalties ";
     if (params.mirostat == 0) {
-        for (auto s : params.samplers_sequence) {
-            switch (s) {
-                case 'k': result += "-> top_k "; break;
-                case 'f': result += "-> tfs_z "; break;
-                case 'y': result += "-> typical_p "; break;
-                case 'p': result += "-> top_p "; break;
-                case 'm': result += "-> min_p "; break;
-                case 't': result += "-> temp "; break;
-                default : break;
+        for (auto sampler_type : params.samplers_sequence) {
+            const auto sampler_type_name = sampler_type_to_name_string(sampler_type);
+            if (!sampler_type_name.empty()) {
+                result += "-> " + sampler_type_name + " ";
             }
         }
     } else {
@@ -127,8 +122,6 @@ static void sampler_queue(
                    const llama_sampling_params & params,
                    llama_token_data_array & cur_p,
                    size_t & min_keep) {
-    const int n_vocab = llama_n_vocab(llama_get_model(ctx_main));
-
     const float temp              = params.temp;
     const float dynatemp_range    = params.dynatemp_range;
     const float dynatemp_exponent = params.dynatemp_exponent;
@@ -137,16 +130,16 @@ static void sampler_queue(
     const float min_p     = params.min_p;
     const float tfs_z     = params.tfs_z;
     const float typical_p = params.typical_p;
-    const std::string & samplers_sequence = params.samplers_sequence;
-
-    for (auto s : samplers_sequence) {
-        switch (s){
-            case 'k': llama_sample_top_k    (ctx_main, &cur_p, top_k,     min_keep); break;
-            case 'f': llama_sample_tail_free(ctx_main, &cur_p, tfs_z,     min_keep); break;
-            case 'y': llama_sample_typical  (ctx_main, &cur_p, typical_p, min_keep); break;
-            case 'p': llama_sample_top_p    (ctx_main, &cur_p, top_p,     min_keep); break;
-            case 'm': llama_sample_min_p    (ctx_main, &cur_p, min_p,     min_keep); break;
-            case 't':
+    const std::vector<llama_sampler_type> & samplers_sequence = params.samplers_sequence;
+
+    for (auto sampler_type : samplers_sequence) {
+        switch (sampler_type) {
+            case llama_sampler_type::TOP_K    : llama_sample_top_k    (ctx_main, &cur_p, top_k,     min_keep); break;
+            case llama_sampler_type::TFS_Z    : llama_sample_tail_free(ctx_main, &cur_p, tfs_z,     min_keep); break;
+            case llama_sampler_type::TYPICAL_P: llama_sample_typical  (ctx_main, &cur_p, typical_p, min_keep); break;
+            case llama_sampler_type::TOP_P    : llama_sample_top_p    (ctx_main, &cur_p, top_p,     min_keep); break;
+            case llama_sampler_type::MIN_P    : llama_sample_min_p    (ctx_main, &cur_p, min_p,     min_keep); break;
+            case llama_sampler_type::TEMP:
                 if (dynatemp_range > 0) {
                     float dynatemp_min = std::max(0.0f, temp - dynatemp_range);
                     float dynatemp_max = std::max(0.0f, temp + dynatemp_range);
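
With the sequence stored as enums, the order printout and the queue dispatch both route through the shared name mapping instead of duplicated char switches. A stand-alone sketch of how the reworked llama_sampling_order_print() assembles its output (stand-in enum and helper, not the actual llama.cpp declarations):

    #include <cstdio>
    #include <string>
    #include <vector>

    enum class sampler_type { TOP_K, TFS_Z, TYPICAL_P, TOP_P, MIN_P, TEMP };

    // Stand-in for sampler_type_to_name_string().
    static std::string to_name(sampler_type t) {
        switch (t) {
            case sampler_type::TOP_K:     return "top_k";
            case sampler_type::TFS_Z:     return "tfs_z";
            case sampler_type::TYPICAL_P: return "typical_p";
            case sampler_type::TOP_P:     return "top_p";
            case sampler_type::MIN_P:     return "min_p";
            case sampler_type::TEMP:      return "temp";
        }
        return "";
    }

    int main() {
        std::string result = "CFG -> Penalties ";
        const std::vector<sampler_type> sequence = {
            sampler_type::TOP_K, sampler_type::TFS_Z, sampler_type::TYPICAL_P,
            sampler_type::TOP_P, sampler_type::MIN_P, sampler_type::TEMP,
        };
        for (auto t : sequence) {
            result += "-> " + to_name(t) + " ";  // same formatting as the CLI output
        }
        std::printf("%s\n", result.c_str());
        // prints: CFG -> Penalties -> top_k -> tfs_z -> typical_p -> top_p -> min_p -> temp
        return 0;
    }

sampler_queue() walks the same enum vector, so adding a sampler means one new enum value, one name-map entry, and one dispatch case.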
