Commit eb34620

Add tokenizer test + revert to C++11 (#355)
* Add test-tokenizer-0 to do a few tokenizations - feel free to expand
* Added option to convert-pth-to-ggml.py script to dump just the vocabulary
* Added ./models/ggml-vocab.bin containing just LLaMA vocab data (used for tests)
* Added utility to load vocabulary file from previous point (temporary implementation)
* Avoid using std::string_view and drop back to C++11 (hope I didn't break something)
* Rename gpt_vocab -> llama_vocab
* All CMake binaries go into ./bin/ now
1 parent 2e664f1 commit eb34620

File tree

11 files changed (+249, -148 lines)

.github/workflows/build.yml

Lines changed: 3 additions & 0 deletions

@@ -54,6 +54,7 @@ jobs:
           cd build
           cmake ..
           cmake --build . --config Release
+          ctest --output-on-failure
 
   macOS-latest-make:
     runs-on: macos-latest
@@ -90,6 +91,7 @@ jobs:
           cd build
           cmake ..
           cmake --build . --config Release
+          ctest --output-on-failure
 
   windows-latest-cmake:
     runs-on: windows-latest
@@ -106,6 +108,7 @@ jobs:
           cd build
           cmake ..
           cmake --build . --config Release
+          ctest --output-on-failure
 
       - name: Get commit hash
         id: commit

CMakeLists.txt

Lines changed: 51 additions & 7 deletions

@@ -1,11 +1,37 @@
-cmake_minimum_required(VERSION 3.12)
+cmake_minimum_required(VERSION 3.12) # Don't bump this version for no reason
 project("llama.cpp" C CXX)
 
+set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
+
 if (NOT XCODE AND NOT MSVC AND NOT CMAKE_BUILD_TYPE)
     set(CMAKE_BUILD_TYPE Release CACHE STRING "Build type" FORCE)
     set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS "Debug" "Release" "MinSizeRel" "RelWithDebInfo")
 endif()
 
+set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin)
+
+if(CMAKE_SOURCE_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR)
+    set(LLAMA_STANDALONE ON)
+
+    # configure project version
+    # TODO
+else()
+    set(LLAMA_STANDALONE OFF)
+endif()
+
+if (EMSCRIPTEN)
+    set(BUILD_SHARED_LIBS_DEFAULT OFF)
+
+    option(LLAMA_WASM_SINGLE_FILE "llama: embed WASM inside the generated llama.js" ON)
+else()
+    if (MINGW)
+        set(BUILD_SHARED_LIBS_DEFAULT OFF)
+    else()
+        set(BUILD_SHARED_LIBS_DEFAULT ON)
+    endif()
+endif()
+
+
 #
 # Option list
 #
@@ -34,6 +60,9 @@ option(LLAMA_FMA "llama: enable FMA"
 option(LLAMA_ACCELERATE "llama: enable Accelerate framework" ON)
 option(LLAMA_OPENBLAS   "llama: use OpenBLAS"                OFF)
 
+option(LLAMA_BUILD_TESTS    "llama: build tests"    ${LLAMA_STANDALONE})
+option(LLAMA_BUILD_EXAMPLES "llama: build examples" ${LLAMA_STANDALONE})
+
 #
 # Compile flags
 #
@@ -187,17 +216,19 @@ add_executable(llama main.cpp)
 
 add_executable(quantize quantize.cpp)
 
-add_library(ggml OBJECT
-            ggml.c
-            ggml.h)
-
 add_library(utils OBJECT
             utils.cpp
             utils.h)
 
+target_include_directories(utils PUBLIC .)
+target_compile_features(utils PUBLIC cxx_std_11) # don't bump
+
+add_library(ggml OBJECT
+            ggml.c
+            ggml.h)
+
 target_include_directories(ggml PUBLIC .)
-target_compile_features(ggml PUBLIC c_std_11)
-target_compile_features(utils PUBLIC cxx_std_17)
+target_compile_features(ggml PUBLIC c_std_11) # don't bump
 
 #
 # Linking
@@ -206,3 +237,16 @@ target_compile_features(utils PUBLIC cxx_std_17)
 target_link_libraries(ggml PRIVATE Threads::Threads ${LLAMA_EXTRA_LIBS})
 target_link_libraries(llama PRIVATE ggml utils)
 target_link_libraries(quantize PRIVATE ggml utils)
+
+#
+# programs, examples and tests
+#
+
+if (LLAMA_BUILD_TESTS AND NOT CMAKE_JS_VERSION)
+    enable_testing()
+    add_subdirectory(tests)
+endif ()
+
+#if (LLAMA_BUILD_EXAMPLES)
+#    add_subdirectory(examples)
+#endif()

Makefile

Lines changed: 2 additions & 1 deletion

@@ -30,8 +30,9 @@ endif
 # Compile flags
 #
 
+# keep standard at C11 and C++11
 CFLAGS   = -I.              -O3 -DNDEBUG -std=c11   -fPIC
-CXXFLAGS = -I. -I./examples -O3 -DNDEBUG -std=c++17 -fPIC
+CXXFLAGS = -I. -I./examples -O3 -DNDEBUG -std=c++11 -fPIC
 LDFLAGS  =
 
 # OS specific

convert-pth-to-ggml.py

Lines changed: 25 additions & 3 deletions

@@ -10,26 +10,26 @@
 # - Name (char[name_length])
 # - Data (float[n_dims])
 #
-# By default, the bigger matrices are converted to 16-bit floats.
-# This can be disabled by adding the "use-f32" CLI argument.
-#
 # At the start of the ggml file we write the model parameters
 # and vocabulary.
 #
+
 import argparse
 import os
 import sys
 import json
 import struct
 import numpy as np
 import torch
+
 from sentencepiece import SentencePieceProcessor
 
 def parse_args():
 
     parser = argparse.ArgumentParser(description='Convert a LLaMA model checkpoint to a ggml compatible file')
     parser.add_argument('dir_model', help='directory containing the model checkpoint')
     parser.add_argument('ftype', type=int, choices=[0, 1], default=1, help='file type (0: float32, 1: float16)')
+    parser.add_argument('vocab_only', type=bool, default=False, help='only write vocab to file')
     return parser.parse_args()
 
 def get_n_parts(dim):
@@ -134,6 +134,27 @@ def main():
     ftype_str = ["f32", "f16"]
 
     hparams, tokenizer = load_hparams_and_tokenizer(dir_model)
+
+    # if only writing vocab to file
+    if args.vocab_only:
+
+        fname_model = f"{dir_model}/consolidated.00.pth"
+        fname_out = f"{dir_model}/ggml-vocab.bin"
+
+        print(f"Extracting only the vocab from '{fname_model}'\n")
+
+        model = torch.load(fname_model, map_location="cpu")
+
+        with open(fname_out, "wb") as fout:
+            fout.write(struct.pack("i", hparams["vocab_size"]))
+            write_tokens(fout, tokenizer)
+
+        del model
+
+        print(f"Done. Output file: {fname_out}\n")
+
+        return
+
     n_parts = get_n_parts(hparams["dim"])
 
     for p in range(n_parts):
@@ -151,6 +172,7 @@ def main():
            process_and_write_variables(fout, model, ftype)
 
        del model
+
        print(f"Done. Output file: {fname_out}, (part {p})\n")
 
 if __name__ == "__main__":
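
For reference, the vocab_only branch above writes an int32 vocab size (native byte order, via struct.pack("i", ...)) followed by the token data produced by write_tokens, which is not shown in this hunk. Below is a minimal, illustrative C++ sketch that only reads that leading field back as a sanity check; it is not part of the commit and does not parse the token records, since their layout is defined by write_tokens.

// Illustrative sketch only: reads the leading int32 vocab size written by the
// vocab_only branch of convert-pth-to-ggml.py. The token data that follows is
// emitted by write_tokens(), whose layout is not shown in this diff, so it is
// not parsed here. Assumes the reader runs on a machine with the same byte
// order as the writer.
#include <cstdint>
#include <cstdio>

int main(int argc, char ** argv) {
    if (argc < 2) {
        fprintf(stderr, "Usage: %s <path-to-ggml-vocab.bin>\n", argv[0]);
        return 1;
    }

    FILE * f = fopen(argv[1], "rb");
    if (!f) {
        fprintf(stderr, "failed to open '%s'\n", argv[1]);
        return 1;
    }

    int32_t n_vocab = 0;
    if (fread(&n_vocab, sizeof(n_vocab), 1, f) != 1) {
        fprintf(stderr, "failed to read vocab size\n");
        fclose(f);
        return 1;
    }

    printf("vocab size: %d\n", (int) n_vocab); // 32000 is expected for LLaMA

    fclose(f);
    return 0;
}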

main.cpp

Lines changed: 14 additions & 14 deletions

@@ -90,7 +90,7 @@ struct llama_model {
 };
 
 // load the model's weights from a file
-bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab & vocab, int n_ctx, ggml_type memory_type = GGML_TYPE_F32) {
+bool llama_model_load(const std::string & fname, llama_model & model, llama_vocab & vocab, int n_ctx, ggml_type memory_type = GGML_TYPE_F32) {
     fprintf(stderr, "%s: loading model from '%s' - please wait ...\n", __func__, fname.c_str());
 
     std::vector<char> f_buf(1024*1024);
@@ -544,9 +544,9 @@ bool llama_eval(
         const llama_model & model,
         const int n_threads,
         const int n_past,
-        const std::vector<gpt_vocab::id> & embd_inp,
-              std::vector<float> & embd_w,
-              size_t & mem_per_token) {
+        const std::vector<llama_vocab::id> & embd_inp,
+              std::vector<float> & embd_w,
+              size_t & mem_per_token) {
     const int N = embd_inp.size();
 
     const auto & hparams = model.hparams;
@@ -832,7 +832,7 @@ int main(int argc, char ** argv) {
 
     int64_t t_load_us = 0;
 
-    gpt_vocab vocab;
+    llama_vocab vocab;
     llama_model model;
 
     // load the model
@@ -864,13 +864,13 @@ int main(int argc, char ** argv) {
     // Add a space in front of the first character to match OG llama tokenizer behavior
     params.prompt.insert(0, 1, ' ');
     // tokenize the prompt
-    std::vector<gpt_vocab::id> embd_inp = ::llama_tokenize(vocab, params.prompt, true);
+    std::vector<llama_vocab::id> embd_inp = ::llama_tokenize(vocab, params.prompt, true);
 
     params.n_predict = std::min(params.n_predict, model.hparams.n_ctx - (int) embd_inp.size());
 
     // prefix & suffix for instruct mode
-    const std::vector<gpt_vocab::id> inp_pfx = ::llama_tokenize(vocab, "\n\n### Instruction:\n\n", true);
-    const std::vector<gpt_vocab::id> inp_sfx = ::llama_tokenize(vocab, "\n\n### Response:\n\n", false);
+    const std::vector<llama_vocab::id> inp_pfx = ::llama_tokenize(vocab, "\n\n### Instruction:\n\n", true);
+    const std::vector<llama_vocab::id> inp_sfx = ::llama_tokenize(vocab, "\n\n### Response:\n\n", false);
 
     // in instruct mode, we inject a prefix and a suffix to each input by the user
     if (params.instruct) {
@@ -879,8 +879,8 @@ int main(int argc, char ** argv) {
     }
 
     // tokenize the reverse prompt
-    std::vector<std::vector<gpt_vocab::id>> antipromptv_inp;
-
+    std::vector<std::vector<llama_vocab::id>> antipromptv_inp;
+
     for (auto antiprompt : params.antiprompt) {
         antipromptv_inp.push_back(::llama_tokenize(vocab, antiprompt, false));
     }
@@ -925,14 +925,14 @@ int main(int argc, char ** argv) {
     fprintf(stderr, "sampling parameters: temp = %f, top_k = %d, top_p = %f, repeat_last_n = %i, repeat_penalty = %f\n", params.temp, params.top_k, params.top_p, params.repeat_last_n, params.repeat_penalty);
     fprintf(stderr, "\n\n");
 
-    std::vector<gpt_vocab::id> embd;
+    std::vector<llama_vocab::id> embd;
 
     // determine the required inference memory per token:
     size_t mem_per_token = 0;
     llama_eval(model, params.n_threads, 0, { 0, 1, 2, 3 }, logits, mem_per_token);
 
     int last_n_size = params.repeat_last_n;
-    std::vector<gpt_vocab::id> last_n_tokens(last_n_size);
+    std::vector<llama_vocab::id> last_n_tokens(last_n_size);
     std::fill(last_n_tokens.begin(), last_n_tokens.end(), 0);
 
     if (params.interactive) {
@@ -980,7 +980,7 @@ int main(int argc, char ** argv) {
 
         const int n_vocab = model.hparams.n_vocab;
 
-        gpt_vocab::id id = 0;
+        llama_vocab::id id = 0;
 
         {
            const int64_t t_start_sample_us = ggml_time_us();
@@ -1066,7 +1066,7 @@ int main(int argc, char ** argv) {
            } while (another_line);
            if (params.use_color) printf(ANSI_COLOR_RESET);
 
-           std::vector<gpt_vocab::id> line_inp = ::llama_tokenize(vocab, buffer, false);
+           std::vector<llama_vocab::id> line_inp = ::llama_tokenize(vocab, buffer, false);
            embd_inp.insert(embd_inp.end(), line_inp.begin(), line_inp.end());
 
            if (params.instruct) {
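
The llama_vocab type that replaces gpt_vocab is declared in utils.h, whose hunk is not included in this excerpt. Below is a minimal sketch of what the struct plausibly looks like, inferred only from how it is used in main.cpp and in the new test; the token_to_id member and the exact typedefs are assumptions, not copied from the header.

// Sketch of the renamed vocab type, inferred from usage in main.cpp and
// tests/test-tokenizer-0.cpp; the real definition lives in utils.h (not shown).
#include <cstdint>
#include <map>
#include <string>

struct llama_vocab {
    using id    = int32_t;      // std::vector<llama_vocab::id> holds token ids
    using token = std::string;  // token text

    std::map<token, id> token_to_id; // assumed: forward lookup used by the tokenizer
    std::map<id, token> id_to_token; // used by the test: vocab.id_to_token.size() == 32000
};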

models/ggml-vocab.bin

422 KB (binary file not shown)

quantize.cpp

Lines changed: 1 addition & 1 deletion

@@ -44,7 +44,7 @@ bool llama_model_quantize(const std::string & fname_inp, const std::string & fna
         return false;
     }
 
-    gpt_vocab vocab;
+    llama_vocab vocab;
 
     printf("%s: loading model from '%s'\n", __func__, fname_inp.c_str());
 
tests/CMakeLists.txt

Lines changed: 4 additions & 0 deletions

@@ -0,0 +1,4 @@
+set(TEST_TARGET test-tokenizer-0)
+add_executable(${TEST_TARGET} ${TEST_TARGET}.cpp)
+target_link_libraries(${TEST_TARGET} PRIVATE utils)
+add_test(NAME ${TEST_TARGET} COMMAND $<TARGET_FILE:${TEST_TARGET}> ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab.bin)

tests/test-tokenizer-0.cpp

Lines changed: 69 additions & 0 deletions

@@ -0,0 +1,69 @@
+#include "utils.h"
+
+#include <cstdio>
+#include <string>
+#include <map>
+
+static const std::map<std::string, std::vector<llama_vocab::id>> k_tests = {
+    { "Hello World",        { 1, 10994, 2787, }, },
+    { " Hello World",       { 1, 15043, 2787, }, },
+    { " Hello World!",      { 1, 15043, 2787, 29991, }, },
+    { " this is 🦙.cpp",    { 1, 445, 338, 29871, 243, 162, 169, 156, 29889, 8223, }, },
+    { "w048 7tuijk dsdfhu", { 1, 29893, 29900, 29946, 29947, 29871, 29955, 9161, 13535, 18031, 2176, 6905, }, },
+    { "нещо на Български",  { 1, 821, 4851, 665, 1386, 29713, 1305, }, },
+};
+
+int main(int argc, char **argv) {
+    if (argc < 2) {
+        fprintf(stderr, "Usage: %s <vocab-file>\n", argv[0]);
+        return 1;
+    }
+
+    const std::string fname = argv[1];
+
+    fprintf(stderr, "%s : reading vocab from: '%s'\n", __func__, fname.c_str());
+
+    llama_vocab vocab;
+
+    if (!llama_vocab_load(fname, vocab)) {
+        fprintf(stderr, "%s : failed to load vocab from: '%s'\n", __func__, fname.c_str());
+        return 1;
+    }
+
+    const int n_vocab = vocab.id_to_token.size();
+
+    if (n_vocab != 32000) {
+        fprintf(stderr, "%s : expected 32000 tokens, got %d\n", __func__, n_vocab);
+        return 2;
+    }
+
+    for (const auto & test_kv : k_tests) {
+        const auto res = llama_tokenize(vocab, test_kv.first, true);
+
+        bool correct = res.size() == test_kv.second.size();
+
+        for (int i = 0; i < (int) res.size() && correct; ++i) {
+            if (res[i] != test_kv.second[i]) {
+                correct = false;
+            }
+        }
+
+        if (!correct) {
+            fprintf(stderr, "%s : failed test: '%s'\n", __func__, test_kv.first.c_str());
+            fprintf(stderr, "%s : expected tokens: ", __func__);
+            for (const auto & t : test_kv.second) {
+                fprintf(stderr, "%6d, ", t);
+            }
+            fprintf(stderr, "\n");
+            fprintf(stderr, "%s : got tokens: ", __func__);
+            for (const auto & t : res) {
+                fprintf(stderr, "%6d, ", t);
+            }
+            fprintf(stderr, "\n");
+
+            return 3;
+        }
+    }
+
+    return 0;
+}
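
The test depends on two utils.h helpers whose declarations are not part of this excerpt: llama_vocab_load and llama_tokenize. The sketch below shows the signatures implied by the calls above; the return types are inferred from usage (the load result is tested with !, and the tokenize result is compared element-wise against std::vector<llama_vocab::id>), not copied from the header.

// Assumed declarations for the two utils.h helpers the test calls; utils.h and
// utils.cpp are among the 11 changed files, but their hunks are not shown in
// this excerpt, so treat these signatures as inferred rather than exact.
#include <cstdint>
#include <string>
#include <vector>

struct llama_vocab; // real definition lives in utils.h

bool llama_vocab_load(const std::string & fname, llama_vocab & vocab);

// int32_t stands in for llama_vocab::id here, which needs the full struct
std::vector<int32_t> llama_tokenize(const llama_vocab & vocab,
                                    const std::string & text,
                                    bool bos);

Note that ctest invokes the test binary with the ggml-vocab.bin path registered by add_test in tests/CMakeLists.txt, so the tokenizer check needs only the standalone vocab file, not full model weights.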
