1
1
import regex
2
- import unicodedata
3
-
4
-
5
- if False :
6
-
7
- # This code is equivalent to: cpt.to_bytes(4, "little"), i.e. the UTF-32-LE bytes of the code point (despite the "utf8" in the function name)
8
- def cpt_to_utf8_str (cpt ):
9
- if cpt <= 0xFF :
10
- return bytes ([cpt , 0 , 0 , 0 ])
11
- elif cpt <= 0xFFFF :
12
- return bytes ([cpt & 0xFF , cpt >> 8 , 0 , 0 ])
13
- elif cpt <= 0xFFFFFF :
14
- return bytes ([cpt & 0xFF , (cpt >> 8 ) & 0xFF , (cpt >> 16 ) & 0xFF , 0 ])
15
- else :
16
- return bytes ([cpt & 0xFF , (cpt >> 8 ) & 0xFF , (cpt >> 16 ) & 0xFF , cpt >> 24 ])
17
-
18
- # This code is equivalent to: regex_expr_compiled.match(chr(codepoint))
19
- def is_match (codepoint , regex_expr ):
20
- try :
21
- res = regex_expr .match (cpt_to_utf8_str (codepoint ).decode ('utf-32' ))
22
- return res is not None
23
- except Exception :
24
- return False
25
-
26
- # Verify previous statements, using chr() and ord()
27
- for codepoint in range (0x110000 ):
28
- temp = cpt_to_utf8_str (codepoint )
29
- assert (temp == codepoint .to_bytes (4 , "little" ))
30
- try :
31
- char = temp .decode ('utf-32' )
32
- if codepoint == 0xFEFF : # BOM
33
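- assert (char == "" ) # the 'utf-32' decoder consumes a leading FF FE 00 00 as the byte-order mark, so nothing is left to decode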
34
- char = "\uFEFF "
35
- except UnicodeDecodeError :
36
- continue
37
- assert (char == chr (codepoint ) )
38
- assert (ord (char ) == codepoint )
39
2
40
3
41
4
def get_matches (regex_expr ):
@@ -63,13 +26,11 @@ def get_matches(regex_expr):
63
26
def print_cat(mode, cat, ranges):
    """Print one C++ table definition for the category *cat* to stdout.

    mode   -- "range" emits a ``std::vector<std::pair<uint32_t, uint32_t>>``
              named ``unicode_ranges_<cat>``; "map" emits a
              ``std::map<uint32_t, uint32_t>`` named ``unicode_map_<cat>``.
              Any other value prints no header line at all.
    cat    -- suffix substituted into the generated C++ identifier.
    ranges -- sized sequence of integer tuples; each tuple becomes one
              brace-enclosed row of zero-padded 8-digit hex literals.
    """
    if mode == "range":
        header = "const std::vector<std::pair<uint32_t, uint32_t>> unicode_ranges_{} = {{"
    elif mode == "map":
        header = "const std::map<uint32_t, uint32_t> unicode_map_{} = {{"
    else:
        header = None
    if header is not None:
        print(header.format(cat))

    for index, row in enumerate(ranges):
        hex_fields = ", ".join("0x%08X" % field for field in row)
        # Break the output line after every fourth row and after the last row.
        closes_line = index % 4 == 3 or index + 1 == len(ranges)
        print("{" + hex_fields + "}", end=",\n " if closes_line else ", ")

    print("};")
    print("")
@@ -93,22 +54,11 @@ def print_cat(mode, cat, ranges):
93
54
lower = ord (char .lower ()[0 ])
94
55
upper = ord (char .upper ()[0 ])
95
56
if codepoint != lower :
96
- map_lowercase .append ((codepoint ,lower ))
57
+ map_lowercase .append ((codepoint , lower ))
97
58
if codepoint != upper :
98
- map_uppercase .append ((codepoint ,upper ))
59
+ map_uppercase .append ((codepoint , upper ))
99
60
print_cat ("map" , "lowercase" , map_lowercase )
100
61
print_cat ("map" , "uppercase" , map_uppercase )
101
62
102
63
103
- # TODO: this is wrong
104
- # inv_map_nfd = {}
105
- # for codepoint in range(0x110000):
106
- # char = chr(codepoint)
107
- # norm = ord(unicodedata.normalize('NFD', char)[0])
108
- # if codepoint != norm:
109
- # a, b = inv_map_nfd.get(norm, (codepoint, codepoint))
110
- # inv_map_nfd[norm] = (min(a, codepoint), max(b, codepoint))
111
- # nfd_ranges = [ (a, b, nfd) for nfd,(a,b) in inv_map_nfd.items() ]
112
- # nfd_ranges = list(sorted(nfd_ranges))
113
- # del inv_map_nfd
114
- # print_cat("range_value", "nfd", nfd_ranges)
64
+ # TODO: generate unicode_map_nfd
0 commit comments