Skip to content

Commit d21c440

Browse files
committed
Update on "Let models provide their own specific special tokens"
Differential Revision: [D59651199](https://our.internmc.facebook.com/intern/diff/D59651199/) [ghstack-poisoned]
2 parents 3ee6876 + d114cc5 commit d21c440

File tree

1 file changed

+25
-23
lines changed

1 file changed

+25
-23
lines changed

examples/models/llama2/tokenizer/llama_tiktoken.cpp

Lines changed: 25 additions & 23 deletions
Original file line number | Diff line number | Diff line change
@@ -12,23 +12,23 @@ namespace torch {
1212
namespace executor {
1313
namespace {
1414
static constexpr int32_t kSpecialTokensSize = 256;
15-
static std::string kBOSToken = "<|begin_of_text|>";
1615
static constexpr size_t kBOSTokenIndex = 0;
17-
static std::string kEOSToken = "<|end_of_text|>";
1816
static constexpr size_t kEOSTokenIndex = 1;
1917

2018
static inline std::unique_ptr<std::vector<std::string>>
2119
_get_default_special_tokens() {
22-
auto special_tokens = std::make_unique<std::vector<std::string>>(
23-
std::vector<std::string>{kBOSToken, kEOSToken});
24-
special_tokens->emplace_back("<|reserved_special_token_0|>");
25-
special_tokens->emplace_back("<|reserved_special_token_1|>");
26-
special_tokens->emplace_back("<|reserved_special_token_2|>");
27-
special_tokens->emplace_back("<|reserved_special_token_3|>");
28-
special_tokens->emplace_back("<|start_header_id|>");
29-
special_tokens->emplace_back("<|end_header_id|>");
30-
special_tokens->emplace_back("<|reserved_special_token_4|>");
31-
special_tokens->emplace_back("<|eot_id|>");
20+
auto special_tokens =
21+
std::make_unique<std::vector<std::string>>(std::vector<std::string>{
22+
"<|begin_of_text|>",
23+
"<|end_of_text|>",
24+
"<|reserved_special_token_0|>",
25+
"<|reserved_special_token_1|>",
26+
"<|reserved_special_token_2|>",
27+
"<|reserved_special_token_3|>",
28+
"<|start_header_id|>",
29+
"<|end_header_id|>",
30+
"<|reserved_special_token_4|>",
31+
"<|eot_id|>"});
3232

3333
// pad the rest of the special tokens with reserved tokens
3434
ssize_t reserved_special_token_num = 5;
@@ -42,17 +42,19 @@ _get_default_special_tokens() {
4242

4343
static inline std::unique_ptr<std::vector<std::string>>
4444
_get_multimodal_special_tokens() {
45-
auto special_tokens = std::make_unique<std::vector<std::string>>(
46-
std::vector<std::string>{kBOSToken, kEOSToken});
47-
special_tokens->emplace_back("<|reserved_special_token_0|>");
48-
special_tokens->emplace_back("<|reserved_special_token_1|>");
49-
special_tokens->emplace_back("<|reserved_special_token_2|>");
50-
special_tokens->emplace_back("<|reserved_special_token_3|>");
51-
special_tokens->emplace_back("<|start_header_id|>");
52-
special_tokens->emplace_back("<|end_header_id|>");
53-
special_tokens->emplace_back("<|eom_id|>");
54-
special_tokens->emplace_back("<|eot_id|>");
55-
special_tokens->emplace_back("<|image|>");
45+
auto special_tokens =
46+
std::make_unique<std::vector<std::string>>(std::vector<std::string>{
47+
"<|begin_of_text|>",
48+
"<|end_of_text|>",
49+
"<|reserved_special_token_0|>",
50+
"<|reserved_special_token_1|>",
51+
"<|reserved_special_token_2|>",
52+
"<|reserved_special_token_3|>",
53+
"<|start_header_id|>",
54+
"<|end_header_id|>",
55+
"<|eom_id|>",
56+
"<|eot_id|>",
57+
"<|image|>"});
5658

5759
// pad the rest of the special tokens with reserved tokens except the last
5860
// one

0 commit comments

Comments
 (0)