
Commit 734bdf5

ggerganov authored and hodlen committed
bert : add tests + fix quantization (ggml-org#5475)
* llama : do not quantize pos embd and token type tensors
* ci : add BERT tests ggml-ci
* ci : do not do BERT tests on low-perf nodes ggml-ci
1 parent 30e35de commit 734bdf5

File tree

2 files changed: +51, -1 lines changed


ci/run.sh

Lines changed: 46 additions & 0 deletions
@@ -568,6 +568,50 @@ function gg_sum_open_llama_7b_v2 {
     #gg_printf '- shakespeare (q8_0 / f16 base lora):\n```\n%s\n```\n' "$(cat $OUT/${ci}-ppl-shakespeare-lora-q8_0-f16.log)"
 }
 
+# bge-small
+
+function gg_run_embd_bge_small {
+    cd ${SRC}
+
+    gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/raw/main/config.json
+    gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/resolve/main/tokenizer.model
+    gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/raw/main/tokenizer_config.json
+    gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/raw/main/special_tokens_map.json
+    gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/resolve/main/pytorch_model.bin
+    gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/raw/main/sentence_bert_config.json
+    gg_wget models-mnt/bge-small/ https://huggingface.co/BAAI/bge-small-en-v1.5/raw/main/vocab.txt
+
+    path_models="../models-mnt/bge-small"
+
+    rm -rf build-ci-release && mkdir build-ci-release && cd build-ci-release
+
+    set -e
+
+    (time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
+    (time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log
+
+    python3 ../convert-hf-to-gguf.py ${path_models}
+
+    model_f16="${path_models}/ggml-model-f16.gguf"
+    model_q8_0="${path_models}/ggml-model-q8_0.gguf"
+
+    ./bin/quantize ${model_f16} ${model_q8_0} q8_0
+
+    (time ./bin/embedding --model ${model_f16} -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-f16.log
+    (time ./bin/embedding --model ${model_q8_0} -p "I believe the meaning of life is" ) 2>&1 | tee -a $OUT/${ci}-tg-q8_0.log
+
+    set +e
+}
+
+function gg_sum_embd_bge_small {
+    gg_printf '### %s\n\n' "${ci}"
+
+    gg_printf 'BGE Small (BERT):\n'
+    gg_printf '- status: %s\n' "$(cat $OUT/${ci}.exit)"
+    gg_printf '- f16: \n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-f16.log)"
+    gg_printf '- q8_0:\n```\n%s\n```\n' "$(cat $OUT/${ci}-tg-q8_0.log)"
+}
+
 ## main
 
 if [ -z ${GG_BUILD_LOW_PERF} ]; then
@@ -591,6 +635,8 @@ test $ret -eq 0 && gg_run ctest_debug
 test $ret -eq 0 && gg_run ctest_release
 
 if [ -z ${GG_BUILD_LOW_PERF} ]; then
+    test $ret -eq 0 && gg_run embd_bge_small
+
     if [ -z ${GG_BUILD_VRAM_GB} ] || [ ${GG_BUILD_VRAM_GB} -ge 8 ]; then
         if [ -z ${GG_BUILD_CUDA} ]; then
             test $ret -eq 0 && gg_run open_llama_3b_v2
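
As a usage note, the new test runs through the same harness as the rest of the suite. Below is a minimal sketch of a local invocation, assuming the two-argument interface of ci/run.sh (results directory, then model mount directory) described in the repository's ci/README.md:

```bash
# Run the CI suite locally (CPU-only build). With GG_BUILD_LOW_PERF unset,
# the BGE Small embedding test added above is included; exporting
# GG_BUILD_LOW_PERF=1 would skip it, as this commit intends for low-perf nodes.
mkdir -p tmp
bash ./ci/run.sh ./tmp/results ./tmp/mnt
```

Inside the harness, `gg_run embd_bge_small` downloads the BAAI/bge-small-en-v1.5 files, converts them with convert-hf-to-gguf.py, quantizes the f16 model to q8_0, and runs ./bin/embedding against both models; `gg_sum_embd_bge_small` then folds the logs into the CI summary.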

llama.cpp

Lines changed: 5 additions & 1 deletion
@@ -10444,7 +10444,11 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         quantize &= !params->only_copy;
 
         // do not quantize expert gating tensors
-        quantize &= name.find("ffn_gate_inp.weight") == std::string::npos;
+        quantize &= name != LLM_TN(model.arch)(LLM_TENSOR_FFN_GATE_INP, "weight");
+
+        // do not quantize positional embeddings and token types (BERT)
+        quantize &= name != LLM_TN(model.arch)(LLM_TENSOR_POS_EMBD, "weight");
+        quantize &= name != LLM_TN(model.arch)(LLM_TENSOR_TOKEN_TYPES, "weight");
 
         enum ggml_type new_type;
         void * new_data;
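
The llama.cpp side replaces a substring search with an exact comparison against the canonical tensor name, so only the intended tensors are exempted from quantization. Here is a self-contained sketch of that matching logic; this is not llama.cpp's actual code: `tensor_name` is a hypothetical stand-in for the `LLM_TN` formatter, and the example tensor names are assumptions for illustration.

```cpp
#include <cstdio>
#include <string>

// Hypothetical stand-in for llama.cpp's LLM_TN helper: builds the canonical
// GGUF tensor name "<base>.<suffix>" for a given tensor kind.
static std::string tensor_name(const std::string & base, const std::string & suffix) {
    return base + "." + suffix;
}

int main() {
    // Example tensor names as they might appear in a BERT-style GGUF file
    // (assumed for illustration).
    const std::string names[] = {
        "position_embd.weight",  // positional embeddings -> keep unquantized
        "token_types.weight",    // token type embeddings -> keep unquantized
        "blk.0.attn_qkv.weight", // a regular weight that should be quantized
    };

    for (const auto & name : names) {
        bool quantize = true;

        // Exact comparison, mirroring the commit: unlike a substring search,
        // operator== only skips tensors whose full name matches.
        quantize &= name != tensor_name("position_embd", "weight");
        quantize &= name != tensor_name("token_types",   "weight");

        printf("%-24s -> %s\n", name.c_str(), quantize ? "quantize" : "keep as-is");
    }
    return 0;
}
```

Tying the filter to the same formatter that named the tensors, rather than to a hard-coded substring, keeps the exemption list in sync with the per-architecture naming scheme.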
