Skip to content

Commit 49064d4

Browse files
committed
work towards tokenizer integration
1 parent 1a0a743 commit 49064d4

File tree

3 files changed

+30
-11
lines changed

3 files changed

+30
-11
lines changed

Makefile

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -31,9 +31,8 @@ endif
3131
#
3232

3333
CFLAGS = -I. -O3 -DNDEBUG -std=c11 -fPIC
34-
CXXFLAGS = -I. -I./examples -O3 -DNDEBUG -std=c++11 -fPIC
35-
LDFLAGS =
36-
34+
CXXFLAGS = -I. -I../../sentencepiece/src/ -O3 -DNDEBUG -std=c++11 -fPIC
35+
LDFLAGS =
3736
# OS specific
3837
# TODO: support Windows
3938
ifeq ($(UNAME_S),Linux)
@@ -188,7 +187,7 @@ clean:
188187
rm -f *.o main quantize
189188

190189
main: main.cpp ggml.o utils.o
191-
$(CXX) $(CXXFLAGS) main.cpp ggml.o utils.o -o main $(LDFLAGS)
190+
$(CXX) $(CXXFLAGS) main.cpp ggml.o utils.o /Users/billhamilton/src/sentencepiece/build/src/libsentencepiece.a -o main $(LDFLAGS)
192191
./main -h
193192

194193
quantize: quantize.cpp ggml.o utils.o

build_deps.sh

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
#!/usr/bin/env bash
# Fetch and build the SentencePiece static library that the Makefile links
# against (build/src/libsentencepiece.a).
#
# Upstream: https://github.com/google/sentencepiece.git
# Release v0.1.97; pinned source commit 9ffb33a14c97c512103be0ee74740099660b39aa

set -euo pipefail   # abort on download/build failure instead of continuing

curl -LO https://github.com/google/sentencepiece/releases/download/v0.1.97/sentencepiece-0.1.97.tar.gz
tar xzvf sentencepiece-0.1.97.tar.gz

# Configure from the repository root: src/CMakeLists.txt is not a standalone
# CMake project, so cmake must be pointed at the top-level directory. The
# static library is then produced at build/src/libsentencepiece.a, which is
# the path the Makefile expects.
cd sentencepiece-0.1.97
mkdir -p build
cd build
cmake ..
make sentencepiece-static -j "$(nproc)"

# Return to the directory the script was invoked from.
cd ../..

main.cpp

Lines changed: 15 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,12 @@
1414
#include <signal.h>
1515
#include <unistd.h>
1616

17+
#include <sentencepiece_processor.h>
18+
19+
20+
//Tokenizer object
21+
sentencepiece::SentencePieceProcessor processor;
22+
1723
#define ANSI_COLOR_RED "\x1b[31m"
1824
#define ANSI_COLOR_GREEN "\x1b[32m"
1925
#define ANSI_COLOR_YELLOW "\x1b[33m"
@@ -758,6 +764,11 @@ void sigint_handler(int signo) {
758764
}
759765

760766
int main(int argc, char ** argv) {
767+
const auto status = processor.Load("models/tokenizer.model");
768+
if (!status.ok()) {
769+
printf("%s", status.ToString().c_str());
770+
// error
771+
}
761772
ggml_time_init();
762773
const int64_t t_main_start_us = ggml_time_us();
763774

@@ -807,7 +818,8 @@ int main(int argc, char ** argv) {
807818
std::vector<float> logits;
808819

809820
// tokenize the prompt
810-
std::vector<gpt_vocab::id> embd_inp = ::llama_tokenize(vocab, params.prompt, true);
821+
std::vector<gpt_vocab::id> embd_inp;
822+
processor.Encode(params.prompt, &embd_inp);
811823

812824
params.n_predict = std::min(params.n_predict, model.hparams.n_ctx - (int) embd_inp.size());
813825

@@ -937,12 +949,8 @@ int main(int argc, char ** argv) {
937949
}
938950

939951
// display text
940-
if (!input_noecho) {
941-
for (auto id : embd) {
942-
printf("%s", vocab.id_to_token[id].c_str());
943-
}
944-
fflush(stdout);
945-
}
952+
std::string text;
953+
processor.Decode(all_tokens, &text);
946954

947955
// in interactive mode, and not currently processing queued inputs;
948956
// check if we should prompt the user for more

0 commit comments

Comments
 (0)