Skip to content

Commit 5e1c3ae

Browse files
authored
convert : fix nomic-bert-moe mask token (#13757)
1 parent: c496fe0 · commit: 5e1c3ae

File tree

2 files changed, with 13 additions and 2 deletions.

convert_hf_to_gguf.py

Lines changed: 6 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -3889,6 +3889,12 @@ def _xlmroberta_set_vocab(self) -> None:
38893889
SentencePieceTokenTypes.UNKNOWN,
38903890
] + toktypes[3:-1]
38913891

3892+
if self.model_arch == gguf.MODEL_ARCH.NOMIC_BERT_MOE:
3893+
# Add mask token missing from sentencepiece.bpe.model
3894+
tokens[250001] = b'<mask>'
3895+
scores[250001] = 0.0
3896+
toktypes[250001] = SentencePieceTokenTypes.CONTROL
3897+
38923898
self.gguf_writer.add_tokenizer_model("t5")
38933899
self.gguf_writer.add_tokenizer_pre("default")
38943900
self.gguf_writer.add_token_list(tokens)

src/llama-vocab.cpp

Lines changed: 7 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -2080,9 +2080,11 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
20802080

20812081
std::string model_name;
20822082
std::string tokenizer_pre;
2083+
std::string general_arch;
20832084

20842085
ml.get_key(LLM_KV_GENERAL_NAME, model_name, false);
20852086
ml.get_key(LLM_KV_TOKENIZER_PRE, tokenizer_pre, false);
2087+
ml.get_key(LLM_KV_GENERAL_ARCHITECTURE, general_arch, false);
20862088

20872089
// model name to lowercase
20882090
std::transform(model_name.begin(), model_name.end(), model_name.begin(),
@@ -2091,8 +2093,11 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
20912093
}
20922094
);
20932095

2094-
// set attributes by model/tokenizer name
2095-
if (_contains_any(tokenizer_pre, {"jina-v2-de", "jina-v2-es", "jina-v2-code"})) {
2096+
// set attributes by model/tokenizer/architecture name
2097+
if (false
2098+
|| _contains_any(tokenizer_pre, {"jina-v2-de", "jina-v2-es", "jina-v2-code"})
2099+
|| _contains_any(general_arch, {"nomic-bert-moe"})
2100+
) {
20962101
_set_token_attr("<mask>", LLAMA_TOKEN_ATTR_LSTRIP, true);
20972102
} else if (_contains_any(model_name, {"phi-3", "phi3"})) {
20982103
for (auto id : cache_special_tokens) {

0 commit comments

Comments
 (0)