|
36 | 36 |
|
37 | 37 | fout = open(fname_out, "wb")
|
38 | 38 |
|
39 |
| -fout.write(struct.pack("i", 0x67676d6c)) # magic: ggml in hex |
| 39 | +fout.write(struct.pack("i", 0x67676d66)) # magic: ggmf in hex |
| 40 | +fout.write(struct.pack("i", 1)) # file version |
40 | 41 | fout.write(struct.pack("i", n_vocab))
|
41 | 42 | fout.write(struct.pack("i", n_embd))
|
42 | 43 | fout.write(struct.pack("i", n_mult))
|
|
49 | 50 | # This loop unchanged from convert-pth-to-ggml.py:
|
50 | 51 | for i in range(tokenizer.vocab_size()):
|
51 | 52 | if tokenizer.is_unknown(i):
|
52 |
| - # "<unk>" token (translated as ??) |
53 | 53 | text = " \u2047 ".encode("utf-8")
|
54 |
| - fout.write(struct.pack("i", len(text))) |
55 |
| - fout.write(text) |
56 | 54 | elif tokenizer.is_control(i):
|
57 |
| - # "<s>"/"</s>" tokens |
58 |
| - fout.write(struct.pack("i", 0)) |
| 55 | + text = b"" |
59 | 56 | elif tokenizer.is_byte(i):
|
60 |
| - # "<U+XX>" tokens (which may be invalid UTF-8) |
61 | 57 | piece = tokenizer.id_to_piece(i)
|
62 | 58 | if len(piece) != 6:
|
63 |
| - print("Invalid token: " + piece) |
| 59 | + print(f"Invalid token: {piece}") |
64 | 60 | sys.exit(1)
|
65 | 61 | byte_value = int(piece[3:-1], 16)
|
66 |
| - fout.write(struct.pack("i", 1)) |
67 |
| - fout.write(struct.pack("B", byte_value)) |
| 62 | + text = struct.pack("B", byte_value) |
68 | 63 | else:
|
69 |
| - # normal token. Uses U+2581 (LOWER ONE EIGHTH BLOCK) to represent spaces. |
70 | 64 | text = tokenizer.id_to_piece(i).replace("\u2581", " ").encode("utf-8")
|
71 |
| - fout.write(struct.pack("i", len(text))) |
72 |
| - fout.write(text) |
| 65 | + fout.write(struct.pack("i", len(text))) |
| 66 | + fout.write(text) |
| 67 | + fout.write(struct.pack("f", tokenizer.get_score(i))) |
73 | 68 |
|
74 | 69 | def write_header(shape, dst_name, ftype_cur):
|
75 | 70 | sname = dst_name.encode('utf-8')
|
|
0 commit comments