
Commit 79c8bdc

Fix parsing single-byte UTF-8 tokens by manually parsing the protobuf

1 parent 460c482 commit 79c8bdc

File tree

4 files changed: +396 -11 lines changed

README.md

Lines changed: 1 addition & 1 deletion
@@ -139,7 +139,7 @@ ls ./models
 65B 30B 13B 7B tokenizer_checklist.chk tokenizer.model

 # install Python dependencies
-python3 -m pip install torch numpy sentencepiece
+python3 -m pip install torch numpy protobuf

 # convert the 7B model to ggml FP16 format
 python3 convert-pth-to-ggml.py models/7B/ 1
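
The dependency swap above means the converter no longer drives SentencePiece's own runtime; it only needs the protobuf runtime to load generated bindings. The generated sentencepiece_model_pb2 module that the converter imports (see the next diff) is not shown in the diffs here; presumably it is produced from the sentencepiece_model.proto added by this commit. A minimal sketch of that generation step, assuming protoc is installed and on PATH (an assumption, not something the commit states):

# Hypothetical bootstrap step, not part of the commit: generate
# sentencepiece_model_pb2.py from the sentencepiece_model.proto added below.
# Assumes the protoc compiler is installed and on PATH.
import subprocess

subprocess.run(
    ["protoc", "--python_out=.", "sentencepiece_model.proto"],
    check=True,  # raise CalledProcessError if generation fails
)

import sentencepiece_model_pb2  # the module convert-pth-to-ggml.py imports
print(sentencepiece_model_pb2.ModelProto)  # smoke test: the message class exists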

convert-pth-to-ggml.py

Lines changed: 27 additions & 10 deletions
@@ -23,7 +23,7 @@
 import numpy as np
 import torch

-from sentencepiece import SentencePieceProcessor
+import sentencepiece_model_pb2

 if len(sys.argv) < 3:
     print("Usage: convert-ckpt-to-ggml.py dir-model ftype\n")
@@ -68,9 +68,11 @@ def get_n_parts(dim):
 with open(fname_hparams, "r") as f:
     hparams = json.load(f)

-tokenizer = SentencePieceProcessor(fname_tokenizer)
+tokenizer = sentencepiece_model_pb2.ModelProto()
+with open(fname_tokenizer, "rb") as f:
+    tokenizer.ParseFromString(f.read())

-hparams.update({"vocab_size": tokenizer.vocab_size()})
+hparams.update({"vocab_size": len(tokenizer.pieces)})

 n_parts = get_n_parts(hparams["dim"])

@@ -100,13 +102,28 @@ def get_n_parts(dim):
     fout.write(struct.pack("i", ftype))

     # Is this correct??
-    for i in range(32000):
-        # TODO: this is probably wrong - not sure how this tokenizer works
-        text = tokenizer.decode([29889, i]).encode('utf-8')
-        # remove the first byte (it's always '.')
-        text = text[1:]
-        fout.write(struct.pack("i", len(text)))
-        fout.write(text)
+    for token in tokenizer.pieces:
+        if token.type == 1:
+            # normal token. Uses U+2581 (LOWER ONE EIGHTH BLOCK) to represent spaces.
+            text = token.piece.replace("\u2581", " ").encode("utf-8")
+            fout.write(struct.pack("i", len(text)))
+            fout.write(text)
+        elif token.type == 2:
+            # "<unk>" token (translated as ??)
+            text = " \u2047 ".encode("utf-8")
+            fout.write(struct.pack("i", len(text)))
+            fout.write(text)
+        elif token.type == 3:
+            # "<s>"/"</s>" tokens
+            fout.write(struct.pack("i", 0))
+        elif token.type == 6:
+            # "<U+XX>" tokens (which may be invalid UTF-8)
+            if len(token.piece) != 6:
+                print("Invalid token: " + token.piece)
+                sys.exit(1)
+            byte_value = int(token.piece[3:-1], 16)
+            fout.write(struct.pack("i", 1))
+            fout.write(struct.pack("B", byte_value))

     for k, v in model.items():
         name = k
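
Why the manual parse fixes single-byte tokens: LLaMA's tokenizer.model is a SentencePiece ModelProto containing byte-fallback pieces such as "<0x0A>" (type BYTE = 6 in the schema below), and the old decode()-based loop could not round-trip those raw bytes through UTF-8 strings. Reading the protobuf directly exposes each piece and its type verbatim. A rough inspection sketch under the same assumptions as above (generated sentencepiece_model_pb2 module, tokenizer.model under ./models as in the README):

# Rough inspection sketch, not part of the commit: classify vocabulary pieces
# the same way the converter's new loop does.
import sentencepiece_model_pb2

model = sentencepiece_model_pb2.ModelProto()
with open("models/tokenizer.model", "rb") as f:
    model.ParseFromString(f.read())

# Piece types: 1=NORMAL, 2=UNKNOWN, 3=CONTROL, 6=BYTE (see sentencepiece_model.proto)
counts = {}
for piece in model.pieces:
    counts[piece.type] = counts.get(piece.type, 0) + 1
print("vocab size:", len(model.pieces))
print("pieces per type:", counts)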

sentencepiece_model.proto

Lines changed: 324 additions & 0 deletions
@@ -0,0 +1,324 @@
// SOURCE: https://github.com/google/sentencepiece/blob/9ffb33a14c97c512103be0ee74740099660b39aa/src/sentencepiece_model.proto#L282


// Copyright 2016 Google Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.!

syntax = "proto2";

// TODO(taku): Needs to use LITE RUNTIME in OSS release.
option optimize_for = LITE_RUNTIME;

package sentencepiece;

// TrainerSpec encodes a various parameters for SentencePiece training.
// Next id: 53
message TrainerSpec {
  ///////////////////////////////////////////////////////////////////
  // General parameters
  //
  // Input corpus files.
  // Trainer accepts the following two formats:
  // A) Monolingual: plain text, one sentence per line.
  // B) Bilingual: TSV, source sentence <tab> target sentence
  // When bilingual data is passed, shared vocabulary model is built.
  // Note that the input file must be raw corpus, not a preprocessed corpus.
  // Trainer only loads the first `input_sentence_size` sentences specified
  // with this parameter.
  repeated string input = 1;

  // Input corpus format:
  // "text": one-sentence-per-line text format (default)
  // "tsv":  sentence <tab> freq
  optional string input_format = 7;

  // Output model file prefix.
  // <model_prefix>.model and <model_prefix>.vocab are generated.
  optional string model_prefix = 2;

  // Model type. only have UNIGRAM now.
  enum ModelType {
    UNIGRAM = 1;  // Unigram language model with dynamic algorithm
    BPE = 2;      // Byte Pair Encoding
    WORD = 3;     // Delimitered by whitespace.
    CHAR = 4;     // tokenizes into character sequence
  }
  optional ModelType model_type = 3 [default = UNIGRAM];

  // Vocabulary size. 8k is the default size.
  optional int32 vocab_size = 4 [default = 8000];

  // List of the languages this model can accept.
  // Since the model is language-agnostic, this field is used as a reference.
  repeated string accept_language = 5;

  // Size of self-test samples, which are encoded in the model file.
  optional int32 self_test_sample_size = 6 [default = 0];

  // Whether to use DP version of sentencepiece. Use it with TSV input format
  // (requires precomputed word tab counts to work).
  optional bool enable_differential_privacy = 50 [default = false];
  // Set these parameters if you need DP version of sentencepiece.
  // std of noise to add.
  optional float differential_privacy_noise_level = 51 [default = 0.0];
  // Clipping threshold to apply after adding noise. All the words with
  // frequency less than this value are dropped.
  optional uint64 differential_privacy_clipping_threshold = 52 [default = 0];

  ///////////////////////////////////////////////////////////////////
  // Training parameters.
  //
  // Uses characters which cover the corpus with the ratio of `chars_coverage`.
  // This parameter determines the set of basic Alphabet of sentence piece.
  // 1.0 - `chars_coverage` characters are treated as UNK.
  // See also required_chars field.
  optional float character_coverage = 10 [default = 0.9995];

  // Maximum size of sentences the trainer loads from `input` parameter.
  // Trainer simply loads the `input` files in sequence.
  // It is better to shuffle the input corpus randomly.
  optional uint64 input_sentence_size = 11 [default = 0];
  optional bool shuffle_input_sentence = 19 [default = true];

  // Maximum size of sentences to make seed sentence pieces.
  // Extended suffix array is constructed to extract frequent
  // sub-strings from the corpus. This uses 20N working space,
  // where N is the size of corpus.
  optional int32 mining_sentence_size = 12 [deprecated = true];

  // Maximum size of sentences to train sentence pieces.
  optional int32 training_sentence_size = 13 [deprecated = true];

  // The size of seed sentencepieces.
  // `seed_sentencepiece_size` must be larger than `vocab_size`.
  optional int32 seed_sentencepiece_size = 14 [default = 1000000];

  // In every EM sub-iterations, keeps top
  // `shrinking_factor` * `current sentencepieces size` with respect to
  // the loss of the sentence piece. This value should be smaller than 1.0.
  optional float shrinking_factor = 15 [default = 0.75];

  // The maximum sentence length in byte. The sentences with the length
  // larger than `max_sentence_length` is simply ignored.
  // Longer input tends to bring the following risks:
  // * Overflow during EM training (unigram language model only)
  // * Performance drop because of O(n log n) cost in BPE.
  optional int32 max_sentence_length = 18 [default = 4192];

  // Number of threads in the training.
  optional int32 num_threads = 16 [default = 16];

  // Number of EM sub iterations.
  optional int32 num_sub_iterations = 17 [default = 2];

  ///////////////////////////////////////////////////////////////////
  // SentencePiece parameters which control the shapes of sentence piece.
  //
  // Maximum length of sentencepiece.
  optional int32 max_sentencepiece_length = 20 [default = 16];

  // Uses Unicode script to split sentence pieces.
  // When `split_by_unicode_script` is true, we do not allow sentence piece to
  // include multiple Unicode scripts, e.g. "F1" is not a valid piece.
  // Exception: CJ characters (Hiragana/Katakana/Han) are all handled
  // as one script type, since Japanese word can consist of multiple scripts.
  // This exception is always applied regardless of the accept-language
  // parameter.
  optional bool split_by_unicode_script = 21 [default = true];

  // When `split_by_number` is true, put a boundary between number and
  // non-number transition. If we want to treat "F1" is one token, set this flag
  // to be false.
  optional bool split_by_number = 23 [default = true];

  // Use a white space to split sentence pieces.
  // When `split_by_whitespace` is false, we may have the piece containing
  // a white space in the middle. e.g., "in_the".
  optional bool split_by_whitespace = 22 [default = true];

  // Adds whitespace symbol (_) as a suffix instead of prefix. e.g., _hello =>
  // hello_. When `treat_whitespace_as_suffix` is true,
  // NormalizerSpec::add_dummy_prefix will add the dummy whitespace to the end
  // of sentence.
  optional bool treat_whitespace_as_suffix = 24 [default = false];

  // Allows pieces that only contain whitespaces instead of appearing only as
  // prefix or suffix of other pieces.
  optional bool allow_whitespace_only_pieces = 26 [default = false];

  // Split all digits (0-9) into separate pieces.
  optional bool split_digits = 25 [default = false];

  ///////////////////////////////////////////////////////////////////
  // Vocabulary management
  //
  // Defines control symbols used as an indicator to
  // change the behavior of the decoder. <s> and </s> are pre-defined.
  // We can use this field to encode various meta information,
  // including language indicator in multilingual model.
  // These symbols are not visible to users, but visible to
  // the decoder. Note that when the input sentence contains control symbols,
  // they are not treated as one token, but segmented into normal pieces.
  // Control symbols must be inserted independently from the segmentation.
  repeated string control_symbols = 30;

  // Defines user defined symbols.
  // These symbols are added with extremely high score
  // so they are always treated as one unique symbol in any context.
  // Typical usage of user_defined_symbols is placeholder for named entities.
  repeated string user_defined_symbols = 31;

  // Defines required characters. Each UTF8 character in this string is included
  // in the character set regardless of character_coverage value. Unlike
  // user_defined_symbols, these characters have scores based on the frequency
  // on input sentences, and the model can form subwords using characters
  // in this field.
  optional string required_chars = 36;

  // Decomposes unknown pieces into UTF-8 bytes.
  optional bool byte_fallback = 35 [default = false];

  // When creating the vocabulary file, defines whether or not to additionally
  // output the score for each piece.
  optional bool vocabulary_output_piece_score = 32 [default = true];

  // `vocab_size` is treated as hard limit. Crash if
  // the model can not produce the vocab of size `vocab_size`,
  // When `hard_vocab_limit` is false, vocab_size is treated
  // as soft limit. Note that when model_type=char,
  // always assumes hard_vocab_limit = false.
  optional bool hard_vocab_limit = 33 [default = true];

  // use all symbols for vocab extraction. This flag is valid
  // if model type is either CHAR or WORD
  optional bool use_all_vocab = 34 [default = false];

  ///////////////////////////////////////////////////////////////////
  // Reserved special meta tokens.
  // * -1 is not used.
  // * unk_id must not be -1.
  // Id must starts with 0 and be contigous.
  optional int32 unk_id = 40 [default = 0];   // <unk>
  optional int32 bos_id = 41 [default = 1];   // <s>
  optional int32 eos_id = 42 [default = 2];   // </s>
  optional int32 pad_id = 43 [default = -1];  // <pad> (padding)
  optional string unk_piece = 45 [default = "<unk>"];
  optional string bos_piece = 46 [default = "<s>"];
  optional string eos_piece = 47 [default = "</s>"];
  optional string pad_piece = 48 [default = "<pad>"];

  // Encodes <unk> into U+2047 (DOUBLE QUESTION MARK),
  // since this character can be useful both for user and
  // developer. We can easily figure out that <unk> is emitted.
  optional string unk_surface = 44 [default = " \xE2\x81\x87 "];

  // Increase bit depth to allow unigram model training on large
  // (>10M sentences) corpora. A Side-effect of enabling this flag
  // is increased memory usage.
  optional bool train_extremely_large_corpus = 49 [default = false];

  // Customized extensions: the range of field numbers
  // are open to third-party extensions.
  extensions 200 to max;
}

// NormalizerSpec encodes a various parameters for string normalizaiton
message NormalizerSpec {
  // name of normalization rule.
  optional string name = 1;

  // Pre-compiled normalization rule created by
  // Builder::GetPrecompiledCharsMap() or Builder::CompileCharsMap() method.
  // Usually this field is set by Builder::GetNormalizerSpec() method.
  optional bytes precompiled_charsmap = 2;

  // Adds dummy whitespace at the beginning of text in order to
  // treat "world" in "world" and "hello world" in the same way.
  optional bool add_dummy_prefix = 3 [default = true];

  // Removes leading, trailing, and duplicate internal whitespace.
  optional bool remove_extra_whitespaces = 4 [default = true];

  // Replaces whitespace with meta symbol.
  // This field must be true to train sentence piece model.
  optional bool escape_whitespaces = 5 [default = true];

  // Custom normalization rule file in TSV format.
  // https://github.com/google/sentencepiece/blob/master/doc/normalization.md
  // This field is only used in SentencePieceTrainer::Train() method, which
  // compiles the rule into the binary rule stored in `precompiled_charsmap`.
  optional string normalization_rule_tsv = 6;

  // Customized extensions: the range of field numbers
  // are open to third-party extensions.
  extensions 200 to max;
}

// Proto to store samples for self-testing.
message SelfTestData {
  message Sample {
    optional string input = 1;
    optional string expected = 2;
  }
  repeated Sample samples = 1;

  // Customized extensions: the range of field numbers
  // are open to third-party extensions.
  extensions 200 to max;
}

// ModelProto stores model parameters.
// SentencePieceProcessor is supposed to be self-contained.
// All settings/parameters which may change the behavior must be encoded
// in ModelProto.
message ModelProto {
  message SentencePiece {
    enum Type {
      NORMAL = 1;        // normal symbol
      UNKNOWN = 2;       // unknown symbol. only <unk> for now.
      CONTROL = 3;       // control symbols. </s>, <s>, <2ja> etc.
      USER_DEFINED = 4;  // user defined symbols.
                         // Typical usage of USER_DEFINED symbol
                         // is placeholder.
      BYTE = 6;          // byte symbols. Used when `byte_fallback` is true.
      UNUSED = 5;        // this piece is not used.
    }
    optional string piece = 1;  // piece must not be empty.
    optional float score = 2;
    optional Type type = 3 [default = NORMAL];

    // Customized extensions: the range of field numbers
    // are open to third-party extensions.
    extensions 200 to max;
  }

  // Sentence pieces with scores.
  repeated SentencePiece pieces = 1;

  // Spec used to generate this model file.
  optional TrainerSpec trainer_spec = 2;

  // Spec for text normalization.
  optional NormalizerSpec normalizer_spec = 3;

  // Stores sample input and its expected segmentation to verify the model.
  optional SelfTestData self_test_data = 4;

  // Spec for text de-normalization.
  optional NormalizerSpec denormalizer_spec = 5;

  // Customized extensions: the range of field numbers
  // are open to third-party extensions.
  extensions 200 to max;
}
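
The BYTE = 6 entry in SentencePiece.Type is the case the converter's token.type == 6 branch handles: with byte_fallback, each such piece has the fixed six-character surface form "<0xXX>", and the converter writes the single raw byte it denotes. A worked example of that mapping (the piece value here is illustrative):

# Illustrative only: how a BYTE-type piece's surface form maps back to its raw
# byte, mirroring the token.type == 6 branch in convert-pth-to-ggml.py above.
piece = "<0x0A>"                   # hypothetical BYTE piece from model.pieces
assert len(piece) == 6             # "<0x" + two hex digits + ">"
byte_value = int(piece[3:-1], 16)  # "0A" -> 10
print(bytes([byte_value]))         # b'\n': the raw byte this token emits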
