Skip to content

Commit 49064d4

Browse files
committed
work towards tokenizer integration
1 parent 1a0a743 commit 49064d4

File tree

3 files changed

+30
-11
lines changed

3 files changed

+30
-11
lines changed

Makefile

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -31,9 +31,8 @@ endif
3131
#
3232

3333
CFLAGS = -I. -O3 -DNDEBUG -std=c11 -fPIC
34-
CXXFLAGS = -I. -I./examples -O3 -DNDEBUG -std=c++11 -fPIC
35-
LDFLAGS =
36-
34+
CXXFLAGS = -I. -I../../sentencepiece/src/ -O3 -DNDEBUG -std=c++11 -fPIC
35+
LDFLAGS =
3736
# OS specific
3837
# TODO: support Windows
3938
ifeq ($(UNAME_S),Linux)
@@ -188,7 +187,7 @@ clean:
188187
rm -f *.o main quantize
189188

190189
main: main.cpp ggml.o utils.o
191-
$(CXX) $(CXXFLAGS) main.cpp ggml.o utils.o -o main $(LDFLAGS)
190+
$(CXX) $(CXXFLAGS) main.cpp ggml.o utils.o /Users/billhamilton/src/sentencepiece/build/src/libsentencepiece.a -o main $(LDFLAGS)
192191
./main -h
193192

194193
quantize: quantize.cpp ggml.o utils.o

build_deps.sh

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
#!/usr/bin/env bash
# Fetch and build the SentencePiece static library that the Makefile links
# against (build/src/libsentencepiece.a).
#
# Upstream: https://github.com/google/sentencepiece.git
# Release v0.1.97; pinned source commit 9ffb33a14c97c512103be0ee74740099660b39aa

set -euo pipefail   # abort on download/build failure instead of continuing

curl -LO https://github.com/google/sentencepiece/releases/download/v0.1.97/sentencepiece-0.1.97.tar.gz
tar xzvf sentencepiece-0.1.97.tar.gz

# Configure from the repository root: src/CMakeLists.txt is not a standalone
# CMake project, so cmake must be pointed at the top-level directory. The
# static library is then produced at build/src/libsentencepiece.a, which is
# the path the Makefile expects.
cd sentencepiece-0.1.97
mkdir -p build
cd build
cmake ..
make sentencepiece-static -j "$(nproc)"

# Return to the directory the script was invoked from.
cd ../..

main.cpp

Lines changed: 15 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,12 @@
1414
#include <signal.h>
1515
#include <unistd.h>
1616

17+
#include <sentencepiece_processor.h>
18+
19+
20+
//Tokenizer object
21+
sentencepiece::SentencePieceProcessor processor;
22+
1723
#define ANSI_COLOR_RED "\x1b[31m"
1824
#define ANSI_COLOR_GREEN "\x1b[32m"
1925
#define ANSI_COLOR_YELLOW "\x1b[33m"
@@ -758,6 +764,11 @@ void sigint_handler(int signo) {
758764
}
759765

760766
int main(int argc, char ** argv) {
767+
const auto status = processor.Load("models/tokenizer.model");
768+
if (!status.ok()) {
769+
printf("%s", status.ToString().c_str());
770+
// error
771+
}
761772
ggml_time_init();
762773
const int64_t t_main_start_us = ggml_time_us();
763774

@@ -807,7 +818,8 @@ int main(int argc, char ** argv) {
807818
std::vector<float> logits;
808819

809820
// tokenize the prompt
810-
std::vector<gpt_vocab::id> embd_inp = ::llama_tokenize(vocab, params.prompt, true);
821+
std::vector<gpt_vocab::id> embd_inp;
822+
processor.Encode(params.prompt, &embd_inp);
811823

812824
params.n_predict = std::min(params.n_predict, model.hparams.n_ctx - (int) embd_inp.size());
813825

@@ -937,12 +949,8 @@ int main(int argc, char ** argv) {
937949
}
938950

939951
// display text
940-
if (!input_noecho) {
941-
for (auto id : embd) {
942-
printf("%s", vocab.id_to_token[id].c_str());
943-
}
944-
fflush(stdout);
945-
}
952+
std::string text;
953+
processor.Decode(all_tokens, &text);
946954

947955
// in interactive mode, and not currently processing queued inputs;
948956
// check if we should prompt the user for more

0 commit comments

Comments
 (0)