|
1 | 1 | import regex
|
2 | 2 | import ctypes
|
| 3 | +import unicodedata |
3 | 4 |
|
4 | 5 |
|
5 | 6 | class CoodepointFlags (ctypes.Structure):
|
@@ -32,6 +33,7 @@ class CoodepointFlags (ctypes.Structure):
|
32 | 33 | table_whitespace = []
|
33 | 34 | table_lowercase = []
|
34 | 35 | table_uppercase = []
|
| 36 | +table_nfd = [] |
35 | 37 |
|
36 | 38 | for codepoint in range(MAX_CODEPOINTS):
|
37 | 39 | # convert codepoint to unicode character
|
@@ -63,14 +65,30 @@ class CoodepointFlags (ctypes.Structure):
|
63 | 65 | if codepoint != upper:
|
64 | 66 | table_uppercase.append((codepoint, upper))
|
65 | 67 |
|
| 68 | + # NFD normalization |
| 69 | + norm = ord(unicodedata.normalize('NFD', char)[0]) |
| 70 | + if codepoint != norm: |
| 71 | + table_nfd.append((codepoint, norm)) |
66 | 72 |
|
67 |
# Collapse the per-codepoint flags into runs: each entry is (start, flags),
# where `start` is the first codepoint at which `flags` applies.
ranges_flags = [(0, codepoint_flags[0])]  # start, flags
for codepoint, flags in enumerate(codepoint_flags):
    previous_flags = ranges_flags[-1][1]
    # Flags are compared through their raw bytes; identical bytes mean the
    # current run simply continues.
    if bytes(flags) == bytes(previous_flags):
        continue
    ranges_flags.append((codepoint, flags))
# Closing entry: marks the end of the last run with a default-constructed flags value.
ranges_flags.append((MAX_CODEPOINTS, CoodepointFlags()))
|
72 | 80 |
|
73 | 81 |
|
# Group table_nfd into (start, last, nfd) ranges for the generated table.
#
# A range is extended only while the next entry is the codepoint directly
# after `last` AND maps to the same NFD base; otherwise a new range is opened.
# The contiguity check matters: table_nfd only lists codepoints whose NFD base
# differs from the codepoint itself, so two neighbouring entries with the same
# base can be separated by codepoints that must not be remapped. Merging on
# the base alone would make the range swallow those codepoints.
ranges_nfd = [(0, 0, 0)]  # start, last, nfd (seed entry, stays in the list)
for codepoint, norm in table_nfd:
    start, last, last_norm = ranges_nfd[-1]
    if norm == last_norm and codepoint == last + 1:
        # Contiguous codepoint with the same base: grow the current range.
        ranges_nfd[-1] = (start, codepoint, norm)
    else:
        # Gap in codepoints or a different base: open a new range.
        start = codepoint
        ranges_nfd.append((start, codepoint, norm))
| 90 | + |
| 91 | + |
74 | 92 | # Generate 'unicode-data.cpp'
|
75 | 93 |
|
76 | 94 | print("""\
|
@@ -103,3 +121,8 @@ class CoodepointFlags (ctypes.Structure):
|
# Emit one "{codepoint, uppercase}," initializer per table entry, then close
# the array that was opened by the preceding print.
for entry in table_uppercase:
    print("{0x%06X, 0x%06X}," % entry)
print("};\n")
|
| 124 | + |
# Emit the NFD range table: one "{start, last, nfd}," initializer per range.
print("const std::vector<std::tuple<uint32_t, uint32_t, uint32_t>> unicode_ranges_nfd = { // start, last, nfd")
for start, last, nfd in ranges_nfd:
    print("{0x%06X, 0x%06X, 0x%06X}," % (start, last, nfd))
print("};\n")
0 commit comments