Add sentencepiece tokenizer and modify build (Support UTF-8 / Emojis) #66

Closed · wants to merge 10 commits
4 changes: 2 additions & 2 deletions .github/workflows/build.yml
@@ -16,7 +16,7 @@ jobs:

- name: Build
run: |
make
sh build.sh

macOS-latest:
runs-on: macOS-latest
@@ -31,7 +31,7 @@ jobs:

- name: Build
run: |
make
sh build.sh

# ubuntu-latest-gcc:
# runs-on: ubuntu-latest
1 change: 1 addition & 0 deletions .gitignore
@@ -21,3 +21,4 @@ models/*

arm_neon.h
compile_commands.json
deps
7 changes: 3 additions & 4 deletions Makefile
@@ -31,9 +31,8 @@ endif
#

CFLAGS = -I. -O3 -DNDEBUG -std=c11 -fPIC
CXXFLAGS = -I. -I./examples -O3 -DNDEBUG -std=c++11 -fPIC
LDFLAGS =

CXXFLAGS = -I. -Ideps/sentencepiece-0.1.97/src/ -O3 -DNDEBUG -std=c++17 -fPIC
LDFLAGS =

# OS specific
# TODO: support Windows
ifeq ($(UNAME_S),Linux)
@@ -188,7 +187,7 @@ clean:
rm -f *.o main quantize

main: main.cpp ggml.o utils.o
$(CXX) $(CXXFLAGS) main.cpp ggml.o utils.o -o main $(LDFLAGS)
$(CXX) $(CXXFLAGS) main.cpp ggml.o utils.o deps/libsentencepiece.a -o main $(LDFLAGS)
./main -h

quantize: quantize.cpp ggml.o utils.o
2 changes: 1 addition & 1 deletion README.md
@@ -132,7 +132,7 @@ Here are the steps for the LLaMA-7B model:
# build this repo
git clone https://github.com/ggerganov/llama.cpp
cd llama.cpp
make
./build.sh

# obtain the original LLaMA model weights and place them in ./models
ls ./models
23 changes: 23 additions & 0 deletions build.sh
@@ -0,0 +1,23 @@
#!/bin/sh
# Fetch and build sentencepiece as a static library, then build llama.cpp.

if [ ! -d deps ]
then
    mkdir deps
fi
cd deps

# download the sentencepiece 0.1.97 release tarball (only once)
if [ ! -f v0.1.97.tar.gz ]
then
    curl -LO https://github.com/google/sentencepiece/archive/refs/tags/v0.1.97.tar.gz
fi

# build deps/libsentencepiece.a (only once)
if [ ! -f libsentencepiece.a ]
then
    tar xzvf v0.1.97.tar.gz
    cd sentencepiece-0.1.97/ && rm -rf build && mkdir build && cd build
    cmake --version
    cmake ..
    make sentencepiece-static -j $(nproc)
    cd ../..
    cp sentencepiece-0.1.97/build/src/libsentencepiece.a ./
fi
cd ..

# build main and quantize via the Makefile, which links the static library
make
52 changes: 42 additions & 10 deletions main.cpp
@@ -14,6 +14,12 @@
#include <signal.h>
#include <unistd.h>

#include <sentencepiece_processor.h>


// global SentencePiece tokenizer (loaded in main)
sentencepiece::SentencePieceProcessor processor;

#define ANSI_COLOR_RED "\x1b[31m"
#define ANSI_COLOR_GREEN "\x1b[32m"
#define ANSI_COLOR_YELLOW "\x1b[33m"
@@ -758,6 +764,11 @@ void sigint_handler(int signo) {
}

int main(int argc, char ** argv) {
const auto status = processor.Load("models/tokenizer.model");
if (!status.ok()) {
printf("%s", status.ToString().c_str());
// tokenizer failed to load; the Encode/Decode calls below will not work
}
ggml_time_init();
const int64_t t_main_start_us = ggml_time_us();
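
For orientation, the sentencepiece calls this PR introduces follow a simple load/encode/decode round trip. A minimal, self-contained sketch (not part of this PR, and assuming a valid models/tokenizer.model is present):

#include <sentencepiece_processor.h>

#include <cstdio>
#include <string>
#include <vector>

int main() {
    sentencepiece::SentencePieceProcessor sp;

    // Load() returns a util::Status, checked the same way as above
    const auto status = sp.Load("models/tokenizer.model");
    if (!status.ok()) {
        fprintf(stderr, "%s\n", status.ToString().c_str());
        return 1;
    }

    // text -> token ids (what the prompt tokenization below does)
    std::vector<int> ids;
    sp.Encode("Hello 👋 world", &ids);

    // token ids -> text (what the display path below does)
    std::string text;
    sp.Decode(ids, &text);
    printf("%d tokens -> '%s'\n", (int) ids.size(), text.c_str());
    return 0;
}

Such a program would be compiled with the same -Ideps/sentencepiece-0.1.97/src/ include flag and linked against deps/libsentencepiece.a, exactly as the Makefile change above does for main.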

@@ -807,12 +818,14 @@ int main(int argc, char ** argv) {
std::vector<float> logits;

// tokenize the prompt
std::vector<gpt_vocab::id> embd_inp = ::llama_tokenize(vocab, params.prompt, true);
std::vector<gpt_vocab::id> embd_inp;
processor.Encode(params.prompt, &embd_inp);

params.n_predict = std::min(params.n_predict, model.hparams.n_ctx - (int) embd_inp.size());

// tokenize the reverse prompt
std::vector<gpt_vocab::id> antiprompt_inp = ::llama_tokenize(vocab, params.antiprompt, false);
std::vector<gpt_vocab::id> antiprompt_inp;
processor.Encode(params.antiprompt, &antiprompt_inp);

printf("\n");
printf("%s: prompt: '%s'\n", __func__, params.prompt.c_str());
@@ -843,6 +856,8 @@ int main(int argc, char ** argv) {
printf("\n\n");

std::vector<gpt_vocab::id> embd;
std::vector<gpt_vocab::id> all_tokens;
std::string full_text = "";

// determine the required inference memory per token:
size_t mem_per_token = 0;
@@ -908,6 +923,7 @@

last_n_tokens.erase(last_n_tokens.begin());
last_n_tokens.push_back(id);
all_tokens.push_back(id);

t_sample_us += ggml_time_us() - t_start_sample_us;
}
@@ -926,6 +942,7 @@ int main(int argc, char ** argv) {
embd.push_back(embd_inp[input_consumed]);
last_n_tokens.erase(last_n_tokens.begin());
last_n_tokens.push_back(embd_inp[input_consumed]);
all_tokens.push_back(embd_inp[input_consumed]);
++input_consumed;
if (embd.size() > params.n_batch) {
break;
@@ -935,14 +952,28 @@

// display text
if (!input_noecho) {
for (auto id : embd) {
printf("%s", vocab.id_to_token[id].c_str());
}
// reset color to default if there is no pending user input
if (params.use_color && embd_inp.size() <= input_consumed) {
printf(ANSI_COLOR_RESET);
// check whether the last token alone decodes to an unprintable (incomplete UTF-8) sequence
std::string check;
std::vector<gpt_vocab::id> check_token;
check_token.push_back(all_tokens.at(all_tokens.size()-1));
processor.Decode(check_token, &check);
if(check != "�") {
// the token is printable: decode everything generated so far and emit only the new text
std::string text;
processor.Decode(all_tokens, &text);
if(full_text.length() < text.length()) {
std::string chunk = text.substr(full_text.length());
printf("%s", chunk.c_str());
full_text = text; // remember everything printed so far
// reset color to default if there is no pending user input
if (params.use_color && embd_inp.size() <= input_consumed) {
printf(ANSI_COLOR_RESET);
}
fflush(stdout);
}

}
fflush(stdout);
}

// in interactive mode, and not currently processing queued inputs;
@@ -973,7 +1004,8 @@ int main(int argc, char ** argv) {
buf[n_read+1] = 0;
}

std::vector<gpt_vocab::id> line_inp = ::llama_tokenize(vocab, buf, false);
std::vector<gpt_vocab::id> line_inp;
processor.Encode(buf, &line_inp); // encode the user's new line, not the reverse prompt
embd_inp.insert(embd_inp.end(), line_inp.begin(), line_inp.end());

input_noecho = true; // do not echo this again
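Taken out of the surrounding diff, the display logic above amounts to: decode the whole token sequence, print only the unseen suffix, and hold back any token that by itself decodes to the replacement character. A condensed sketch of that scheme (emit_token is a hypothetical helper name, not part of this PR; it assumes, as the check against "�" above does, that a lone partial multi-byte token decodes to U+FFFD):

#include <sentencepiece_processor.h>

#include <cstdio>
#include <string>
#include <vector>

// Called once per generated token id. Prints only text that has not been
// printed yet, and holds back tokens that decode to an incomplete UTF-8
// sequence until later tokens complete the character.
void emit_token(sentencepiece::SentencePieceProcessor & sp,
                std::vector<int> & all_tokens,
                std::string & printed,
                int id) {
    all_tokens.push_back(id);

    // a lone byte of a multi-byte character decodes to U+FFFD ("�")
    std::string last;
    sp.Decode(std::vector<int>{id}, &last);
    if (last == "\xEF\xBF\xBD") {
        return; // wait for the rest of the character
    }

    // re-decode everything and print only the unseen suffix
    std::string text;
    sp.Decode(all_tokens, &text);
    if (text.size() > printed.size()) {
        fwrite(text.data() + printed.size(), 1, text.size() - printed.size(), stdout);
        fflush(stdout);
        printed = text;
    }
}

Re-decoding the full sequence on every step costs O(n) per token, but it keeps multi-token characters such as emoji intact, which the previous per-token vocab.id_to_token printing could not do.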