Skip to content

Commit a9d8329

Browse files
Author: jaime-m-p
Commit message: Minor + style
1 parent 9bc5d83 commit a9d8329

File tree

3 files changed, +22 -17 lines changed

scripts/gen-unicode-data.py

Lines changed: 21 additions & 16 deletions
Original file line number | Diff line number | Diff line change
@@ -92,7 +92,12 @@ class CoodepointFlags (ctypes.Structure):
9292

9393
# Generate 'unicode-data.cpp'
9494

95-
print("""\
95+
96+
def out(line=""):
97+
print(line, end='\n') # noqa
98+
99+
100+
out("""\
96101
// generated with scripts/gen-unicode-data.py
97102
98103
#include "unicode-data.h"
@@ -103,27 +108,27 @@ class CoodepointFlags (ctypes.Structure):
103108
#include <unordered_set>
104109
""")
105110

106-
print("const std::vector<std::pair<uint32_t, uint16_t>> unicode_ranges_flags = { // start, flags // last=next_start-1")
111+
out("const std::vector<std::pair<uint32_t, uint16_t>> unicode_ranges_flags = { // start, flags // last=next_start-1")
107112
for codepoint, flags in ranges_flags:
108113
flags = int.from_bytes(bytes(flags), "little")
109-
print("{0x%06X, 0x%04X}," % (codepoint, flags))
110-
print("};\n")
114+
out("{0x%06X, 0x%04X}," % (codepoint, flags))
115+
out("};\n")
111116

112-
print("const std::unordered_set<uint32_t> unicode_set_whitespace = {")
113-
print(", ".join("0x%06X" % cpt for cpt in table_whitespace))
114-
print("};\n")
117+
out("const std::unordered_set<uint32_t> unicode_set_whitespace = {")
118+
out(", ".join("0x%06X" % cpt for cpt in table_whitespace))
119+
out("};\n")
115120

116-
print("const std::unordered_map<uint32_t, uint32_t> unicode_map_lowercase = {")
121+
out("const std::unordered_map<uint32_t, uint32_t> unicode_map_lowercase = {")
117122
for tuple in table_lowercase:
118-
print("{0x%06X, 0x%06X}," % tuple)
119-
print("};\n")
123+
out("{0x%06X, 0x%06X}," % tuple)
124+
out("};\n")
120125

121-
print("const std::unordered_map<uint32_t, uint32_t> unicode_map_uppercase = {")
126+
out("const std::unordered_map<uint32_t, uint32_t> unicode_map_uppercase = {")
122127
for tuple in table_uppercase:
123-
print("{0x%06X, 0x%06X}," % tuple)
124-
print("};\n")
128+
out("{0x%06X, 0x%06X}," % tuple)
129+
out("};\n")
125130

126-
print("const std::vector<std::tuple<uint32_t, uint32_t, uint32_t>> unicode_ranges_nfd = { // start, last, nfd")
131+
out("const std::vector<std::tuple<uint32_t, uint32_t, uint32_t>> unicode_ranges_nfd = { // start, last, nfd")
127132
for triple in ranges_nfd:
128-
print("{0x%06X, 0x%06X, 0x%06X}," % triple)
129-
print("};\n")
133+
out("{0x%06X, 0x%06X, 0x%06X}," % triple)
134+
out("};\n")

tests/test-tokenizer-random.py

Lines changed: 0 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -6,7 +6,6 @@
66
# python3 tests/test-tokenizer-random.py ./models/ggml-vocab-llama-bpe.gguf ./models/tokenizers/llama-bpe
77
#
88

9-
import os
109
import time
1110
import logging
1211
import argparse

unicode.cpp

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -12,6 +12,7 @@
1212
#include <unordered_set>
1313
#include <utility>
1414
#include <vector>
15+
#include <array>
1516
#include <locale>
1617
#include <codecvt>
1718

0 commit comments

Comments
 (0)