Skip to content

Commit 641944a

Browse files
author
jaime-m-p
committed
Unicode normalization NFD
1 parent 707a08d commit 641944a

File tree

4 files changed

+1858
-3
lines changed

4 files changed

+1858
-3
lines changed

scripts/gen-unicode-data.py

Lines changed: 24 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
import regex
22
import ctypes
3+
import unicodedata
34

45

56
class CoodepointFlags (ctypes.Structure):
@@ -32,6 +33,7 @@ class CoodepointFlags (ctypes.Structure):
3233
table_whitespace = []
3334
table_lowercase = []
3435
table_uppercase = []
36+
table_nfd = []
3537

3638
for codepoint in range(MAX_CODEPOINTS):
3739
# convert codepoint to unicode character
@@ -63,14 +65,30 @@ class CoodepointFlags (ctypes.Structure):
6365
if codepoint != upper:
6466
table_uppercase.append((codepoint, upper))
6567

68+
# NFD normalization
69+
norm = ord(unicodedata.normalize('NFD', char)[0])
70+
if codepoint != norm:
71+
table_nfd.append((codepoint, norm))
6672

67-
ranges_flags = [(0, codepoint_flags[0])]
73+
74+
# Collapse the per-codepoint flags into ranges: a new (start, flags) entry is
# recorded each time the flags differ from those of the most recent entry.
ranges_flags = [(0, codepoint_flags[0])]  # start, flags
for codepoint, flags in enumerate(codepoint_flags):
    last_flags = ranges_flags[-1][1]
    # ctypes structures are compared through their raw bytes
    if bytes(flags) == bytes(last_flags):
        continue
    ranges_flags.append((codepoint, flags))
# Final entry at MAX_CODEPOINTS with default-constructed flags
# (presumably terminates the last range -- confirm against the C++ consumer).
ranges_flags.append((MAX_CODEPOINTS, CoodepointFlags()))
7280

7381

82+
# Group the NFD mappings into ranges of *contiguous* codepoints that share the
# same normalized codepoint. Each entry is (start, last, nfd), both bounds inclusive.
# The list starts with a (0, 0, 0) placeholder so the loop always has a previous
# entry to compare against; the placeholder stays in the resulting list.
# table_nfd is filled while iterating codepoints in ascending order, so only the
# most recent range can ever be extended.
ranges_nfd = [(0, 0, 0)]  # start, last, nfd
for codepoint, norm in table_nfd:
    start = ranges_nfd[-1][0]
    # Extend the previous range only when this codepoint directly follows it AND
    # maps to the same value. Comparing `norm` alone would merge mappings that are
    # separated by codepoints without an NFD mapping, wrongly covering the gap.
    if ranges_nfd[-1] != (start, codepoint - 1, norm):
        ranges_nfd.append(None)  # placeholder slot, filled in just below
        start = codepoint
    ranges_nfd[-1] = (start, codepoint, norm)
90+
91+
7492
# Generate 'unicode-data.cpp'
7593

7694
print("""\
@@ -103,3 +121,8 @@ class CoodepointFlags (ctypes.Structure):
103121
for tuple in table_uppercase:
104122
print("{0x%06X, 0x%06X}," % tuple)
105123
print("};\n")
124+
125+
# Emit the NFD range table as a C++ initializer list: one {start, last, nfd}
# row per entry of ranges_nfd, each value formatted as 6-digit uppercase hex.
print("const std::vector<std::tuple<uint32_t, uint32_t, uint32_t>> unicode_ranges_nfd = { // start, last, nfd")
for triple in ranges_nfd:
    row = "{0x%06X, 0x%06X, 0x%06X}," % triple
    print(row)
print("};\n")

0 commit comments

Comments
 (0)