
Commit 43248e5

Authored by jaime-m-p, jaggzh, dragnil1, and ggerganov
llama3 custom regex split (#6965)
* merged the changes from deepseeker models to main branch
* Moved regex patterns to unicode.cpp and updated unicode.h
* Moved header files
* Resolved issues
* added and refactored unicode_regex_split and related functions
* Updated/merged the deepseek coder pr
* Refactored code
* Adding unicode regex mappings
* Adding unicode regex function
* Added needed functionality, testing remains
* Fixed issues
* Fixed issue with gpt2 regex custom preprocessor
* unicode : fix? unicode_wstring_to_utf8
* lint : fix whitespaces
* tests : add tokenizer tests for numbers
* unicode : remove redundant headers
* tests : remove and rename tokenizer test scripts
* tests : add sample usage
* gguf-py : reader prints warnings on duplicate keys
* llama : towards llama3 tokenization support (wip)
* unicode : shot in the dark to fix tests on Windows
* unicode : first try custom implementations
* convert : add "tokenizer.ggml.pre" GGUF KV (wip)
* llama : use new pre-tokenizer type
* convert : fix pre-tokenizer type writing
* lint : fix
* make : add test-tokenizer-0-llama-v3
* wip
* models : add llama v3 vocab file
* llama : adapt punctuation regex + add llama 3 regex
* minor
* unicode : set bomb
* unicode : set bomb
* unicode : always use std::wregex
* unicode : support \p{N}, \p{L} and \p{P} natively
* unicode : try fix windows
* unicode : category support via std::regex
* unicode : clean-up
* unicode : simplify
* llama3 custom regex split
* convert : add convert-hf-to-gguf-update.py ggml-ci
* lint : update
* convert : add falcon ggml-ci
* unicode : normalize signatures
* lint : fix
* lint : fix
* convert : remove unused functions
* convert : add comments
* convert : exercise contractions ggml-ci
* Using char32_t for codepoints
* lint : fix
* already exists unicode_tolower()
* Typing
* Restore BOM
* cmake : refactor test targets
* tests : refactor vocab tests ggml-ci
* tests : add more vocabs and tests ggml-ci
* unicode : cleanup
* scripts : ignore new update script in check-requirements.sh
* Fix merge
* models : add phi-3, mpt, gpt-2, starcoder
* tests : disable obsolete ggml-ci
* tests : use faster bpe test ggml-ci
* llama : more prominent warning for old BPE models
* tests : disable test-tokenizer-1-bpe due to slowness ggml-ci
* Move unused variable value
* GPT2 custom regex split
* Add alternative regex for custom split llama3
  Co-authored-by: Georgi Gerganov <[email protected]>
* Style
* Add bruteforce random tests for token encoding
* wip: fixing unicode codepoint ranges
* Fix merge
* Unicode tables: separator, lowercase, uppercase and whitespace
* llama3 custom regex split: fix \s
* Restore BOM
* Style
* wip: generate NFD table
* Ignore special tokens for testing
* Clean gen-unicode-data.py
* Refactor random tokenizer test
* lint : fix
* tests : add fail test for llama-bpe

---------

Co-authored-by: Jaggzh <[email protected]>
Co-authored-by: Kazim Abrar Mahi <[email protected]>
Co-authored-by: Georgi Gerganov <[email protected]>
Co-authored-by: jaime-m-p <>
1 parent a743d76 commit 43248e5
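
The thread running through these commits: BPE tokenization first splits raw text with the model's own pre-tokenizer regex, and since std::regex cannot portably handle \p{L}-style Unicode property classes, this PR adds custom category tables plus a custom splitter. A rough Python sketch of the splitting step follows; the pattern is a llama-3-style pre-tokenizer regex reproduced from memory for illustration, not copied from this commit:

    import regex  # third-party 'regex' module; supports the \p{..} classes the C++ side emulates

    # Assumption: llama-3-style split pattern, quoted from memory for
    # illustration only -- verify against llama.cpp before relying on it.
    LLAMA3_SPLIT = (
        r"(?i:'s|'t|'re|'ve|'m|'ll|'d)"   # English contractions
        r"|[^\r\n\p{L}\p{N}]?\p{L}+"      # words, optionally led by one non-letter
        r"|\p{N}{1,3}"                    # numbers in chunks of at most 3 digits
        r"| ?[^\s\p{L}\p{N}]+[\r\n]*"     # punctuation/symbol runs
        r"|\s*[\r\n]+|\s+(?!\S)|\s+"      # whitespace handling
    )

    def pre_tokenize(text: str) -> list[str]:
        # Each regex match becomes one pre-token; BPE merges then run per piece.
        return regex.findall(LLAMA3_SPLIT, text)

    print(pre_tokenize("Hello world, 33333333!"))
    # -> ['Hello', ' world', ',', ' ', '333', '333', '33', '!']

Note how the digit chunking (at most three digits per pre-token) explains the "3333333" probe strings in the first diff below.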

File tree: 8 files changed, +1463 −544 lines

convert-hf-to-gguf-update.py

Lines changed: 1 addition & 0 deletions
@@ -261,6 +261,7 @@ def get_vocab_base_pre(self, tokenizer) -> str:
     "3333333",
     "33333333",
     "333333333",
+    # "Cửa Việt", # llama-bpe fails on this
     chktxt,
 ]
 
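
These strings belong to the probe set used to fingerprint pre-tokenizers: the update script encodes the probes with each reference tokenizer and hashes the resulting token IDs, which is why a string that llama-bpe currently mishandles (the Vietnamese "Cửa Việt" case) stays commented out. A sketch of the idea, assuming a Hugging Face tokenizer; the helper name is illustrative, not the script's own:

    import hashlib
    from transformers import AutoTokenizer  # the update script also drives HF tokenizers

    def pre_tokenizer_fingerprint(model_id: str, probe_text: str) -> str:
        # Two tokenizers with different pre-tokenizer regexes will generally
        # produce different token IDs for the same probe, hence different hashes.
        tokenizer = AutoTokenizer.from_pretrained(model_id)
        ids = tokenizer.encode(probe_text)
        return hashlib.sha256(str(ids).encode()).hexdigest()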

llama.cpp

Lines changed: 1 addition & 1 deletion
@@ -12488,7 +12488,7 @@ struct llm_tokenizer_wpm {
                 continue;
             }
             code = unicode_tolower(code);
-            if (type == CODEPOINT_TYPE_WHITESPACE) {
+            if (type == CODEPOINT_TYPE_SEPARATOR) {
                 code = ' ';
            }
             std::string s = unicode_cpt_to_utf8(code);
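
The rename from WHITESPACE to SEPARATOR is not cosmetic: regex \s and the Unicode separator categories (\p{Z}) are different sets, which is why gen-unicode-data.py below now emits a dedicated whitespace table from r'\s' alongside the separator table. A quick check of the difference with Python's regex module (a sketch, not code from this commit):

    import regex

    for name, ch in [("TAB", "\t"), ("LF", "\n"), ("NBSP", "\u00a0"), ("SPACE", " ")]:
        is_sep = bool(regex.match(r"\p{Z}", ch))  # separator category (CODEPOINT_TYPE_SEPARATOR)
        is_ws = bool(regex.match(r"\s", ch))      # regex whitespace
        print(f"{name}: separator={is_sep} whitespace={is_ws}")
    # TAB: separator=False whitespace=True
    # LF: separator=False whitespace=True
    # NBSP: separator=True whitespace=True
    # SPACE: separator=True whitespace=True

Tab and newline are whitespace but control characters rather than separators, so the two C++ tables genuinely diverge.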

scripts/gen-unicode-data.py

Lines changed: 38 additions & 40 deletions
@@ -1,31 +1,14 @@
 import regex
 
 
-def cpt_to_utf8_str(cpt):
-    if cpt <= 0xFF:
-        return bytes([cpt, 0, 0, 0])
-    elif cpt <= 0xFFFF:
-        return bytes([cpt & 0xFF, cpt >> 8, 0, 0])
-    elif cpt <= 0xFFFFFF:
-        return bytes([cpt & 0xFF, (cpt >> 8) & 0xFF, (cpt >> 16) & 0xFF, 0])
-    else:
-        return bytes([cpt & 0xFF, (cpt >> 8) & 0xFF, (cpt >> 16) & 0xFF, cpt >> 24])
-
-
-def is_match(codepoint, regex_expr):
-    try:
-        res = regex.match(regex_expr, cpt_to_utf8_str(codepoint).decode('utf-32'))
-        return res is not None
-    except Exception:
-        return False
-
-
 def get_matches(regex_expr):
+    regex_expr_compiled = regex.compile(regex_expr)
     unicode_ranges = []
     current_range = None
 
     for codepoint in range(0x110000):
-        if is_match(codepoint, regex_expr):
+        char = chr(codepoint)
+        if regex_expr_compiled.match(char):
             if current_range is None:
                 current_range = [codepoint, codepoint]
             else:
@@ -40,27 +23,42 @@ def get_matches(regex_expr):
     return unicode_ranges
 
 
-def print_cat(cat, ranges):
-    print("const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_{} = {{".format(cat))  # noqa: NP100
-    cnt = 0
-    for start, end in ranges:
-        if cnt % 4 != 0:
-            print(" ", end="")  # noqa: NP100
-        print("{{0x{:08X}, 0x{:08X}}},".format(start, end), end="")  # noqa: NP100
-        if cnt % 4 == 3:
-            print("")  # noqa: NP100
-        cnt += 1
-
-    if cnt % 4 != 0:
-        print("")  # noqa: NP100
+def print_cat(mode, cat, ranges):
+    if mode == "range":
+        print("const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_{} = {{".format(cat))  # noqa: NP100
+    if mode == "map":
+        print("const std::map<uint32_t, uint32_t> unicode_map_{} = {{".format(cat))  # noqa: NP100
+    for i, values in enumerate(ranges):
+        end = ",\n" if (i % 4 == 3 or i + 1 == len(ranges)) else ", "
+        values = ["0x%08X" % value for value in values]
+        print("{" + ", ".join(values) + "}", end=end)  # noqa: NP100
     print("};")  # noqa: NP100
     print("")  # noqa: NP100
 
 
-print_cat("number", get_matches(r'\p{N}'))
-print_cat("letter", get_matches(r'\p{L}'))
-print_cat("whitespace", get_matches(r'\p{Z}'))
-print_cat("accent_mark", get_matches(r'\p{M}'))
-print_cat("punctuation", get_matches(r'\p{P}'))
-print_cat("symbol", get_matches(r'\p{S}'))
-print_cat("control", get_matches(r'\p{C}'))
+print_cat("range", "number", get_matches(r'\p{N}'))
+print_cat("range", "letter", get_matches(r'\p{L}'))
+print_cat("range", "separator", get_matches(r'\p{Z}'))
+print_cat("range", "accent_mark", get_matches(r'\p{M}'))
+print_cat("range", "punctuation", get_matches(r'\p{P}'))
+print_cat("range", "symbol", get_matches(r'\p{S}'))
+print_cat("range", "control", get_matches(r'\p{C}'))
+
+print_cat("range", "whitespace", get_matches(r'\s'))
+
+
+map_lowercase = []
+map_uppercase = []
+for codepoint in range(0x110000):
+    char = chr(codepoint)
+    lower = ord(char.lower()[0])
+    upper = ord(char.upper()[0])
+    if codepoint != lower:
+        map_lowercase.append((codepoint, lower))
+    if codepoint != upper:
+        map_uppercase.append((codepoint, upper))
+print_cat("map", "lowercase", map_lowercase)
+print_cat("map", "uppercase", map_uppercase)
+
+
+# TODO: generate unicode_map_nfd
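
On the C++ side the emitted tables support category lookups without std::regex; the natural lookup over sorted, non-overlapping ranges is a binary search. A minimal Python sketch of that consumption pattern, where the two ranges are a hand-picked excerpt of what get_matches(r'\p{N}') produces and the function name is illustrative:

    import bisect

    # Excerpt: ASCII digits and Arabic-Indic digits, both in category \p{N}.
    unicode_ranges_number = [(0x0030, 0x0039), (0x0660, 0x0669)]

    def is_number(cpt: int) -> bool:
        # Find the last range starting at or before cpt, then test its end.
        i = bisect.bisect_right(unicode_ranges_number, (cpt, 0x10FFFF)) - 1
        return i >= 0 and unicode_ranges_number[i][0] <= cpt <= unicode_ranges_number[i][1]

    assert is_number(ord("7"))      # ASCII digit
    assert is_number(0x0665)        # ARABIC-INDIC DIGIT FIVE
    assert not is_number(ord("a"))  # letter, not a number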
