# written by Fredrik Lundh ([email protected])
#

+import dataclasses
import os
import sys
import zipfile

from functools import partial
from textwrap import dedent
-from typing import Iterator, List, Tuple
+from typing import Iterator, List, Optional, Set, Tuple

SCRIPT = sys.argv[0]
VERSION = "3.3"
@@ -148,12 +149,12 @@ def makeunicodedata(unicode, trace):
        record = unicode.table[char]
        if record:
            # extract database properties
-            category = CATEGORY_NAMES.index(record[2])
-            combining = int(record[3])
-            bidirectional = BIDIRECTIONAL_NAMES.index(record[4])
-            mirrored = record[9] == "Y"
-            eastasianwidth = EASTASIANWIDTH_NAMES.index(record[15])
-            normalizationquickcheck = record[17]
+            category = CATEGORY_NAMES.index(record.general_category)
+            combining = int(record.canonical_combining_class)
+            bidirectional = BIDIRECTIONAL_NAMES.index(record.bidi_class)
+            mirrored = record.bidi_mirrored == "Y"
+            eastasianwidth = EASTASIANWIDTH_NAMES.index(record.east_asian_width)
+            normalizationquickcheck = record.quick_check
            item = (
                category, combining, bidirectional, mirrored, eastasianwidth,
                normalizationquickcheck
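For orientation, the old numeric indices line up one-to-one with the field order of the UcdRecord dataclass added further down in this patch. A small sanity check, assuming the updated script is importable as a module (the import path is an assumption, not part of the change):

```python
import dataclasses
from makeunicodedata import UcdRecord  # hypothetical import of the updated script

# Field definition order mirrors the old list layout, so each numeric
# index used above maps onto a named attribute.
names = [f.name for f in dataclasses.fields(UcdRecord)]
assert names[2] == "general_category"           # was record[2]
assert names[3] == "canonical_combining_class"  # was record[3]
assert names[4] == "bidi_class"                 # was record[4]
assert names[9] == "bidi_mirrored"              # was record[9]
assert names[15] == "east_asian_width"          # was record[15]
assert names[17] == "quick_check"               # was record[17]
```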
@@ -179,8 +180,8 @@ def makeunicodedata(unicode, trace):
    for char in unicode.chars:
        record = unicode.table[char]
        if record:
-            if record[5]:
-                decomp = record[5].split()
+            if record.decomposition_type:
+                decomp = record.decomposition_type.split()
                if len(decomp) > 19:
                    raise Exception("character %x has a decomposition too large for nfd_nfkd" % char)
                # prefix
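As a reminder of why the `.split()` is needed: field 5 of UnicodeData.txt carries an optional compatibility tag followed by the mapped code points, so `decomposition_type` holds both pieces in one string. The sample values below are drawn from the standard data file, not from this diff:

```python
# Canonical decomposition, e.g. U+00C0 LATIN CAPITAL LETTER A WITH GRAVE
assert "0041 0300".split() == ["0041", "0300"]

# Compatibility decomposition with a tag, e.g. U+FB01 LATIN SMALL LIGATURE FI
assert "<compat> 0066 0069".split() == ["<compat>", "0066", "0069"]
```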
@@ -200,7 +201,7 @@ def makeunicodedata(unicode, trace):
                # Collect NFC pairs
                if not prefix and len(decomp) == 3 and \
                   char not in unicode.exclusions and \
-                   unicode.table[decomp[1]][3] == "0":
+                   unicode.table[decomp[1]].canonical_combining_class == "0":
                    p, l, r = decomp
                    comp_first[l] = 1
                    comp_last[r] = 1
@@ -404,9 +405,9 @@ def makeunicodetype(unicode, trace):
        record = unicode.table[char]
        if record:
            # extract database properties
-            category = record[2]
-            bidirectional = record[4]
-            properties = record[16]
+            category = record.general_category
+            bidirectional = record.bidi_class
+            properties = record.binary_properties
            flags = 0
            if category in ["Lm", "Lt", "Lu", "Ll", "Lo"]:
                flags |= ALPHA_MASK
@@ -434,16 +435,16 @@ def makeunicodetype(unicode, trace):
                flags |= CASE_IGNORABLE_MASK
            sc = unicode.special_casing.get(char)
            cf = unicode.case_folding.get(char, [char])
-            if record[12]:
-                upper = int(record[12], 16)
+            if record.simple_uppercase_mapping:
+                upper = int(record.simple_uppercase_mapping, 16)
            else:
                upper = char
-            if record[13]:
-                lower = int(record[13], 16)
+            if record.simple_lowercase_mapping:
+                lower = int(record.simple_lowercase_mapping, 16)
            else:
                lower = char
-            if record[14]:
-                title = int(record[14], 16)
+            if record.simple_titlecase_mapping:
+                title = int(record.simple_titlecase_mapping, 16)
            else:
                title = upper
            if sc is None and cf != [lower]:
@@ -480,16 +481,16 @@ def makeunicodetype(unicode, trace):
                extra_casing.extend(sc[1])
            # decimal digit, integer digit
            decimal = 0
-            if record[6]:
+            if record.decomposition_mapping:
                flags |= DECIMAL_MASK
-                decimal = int(record[6])
+                decimal = int(record.decomposition_mapping)
            digit = 0
-            if record[7]:
+            if record.numeric_type:
                flags |= DIGIT_MASK
-                digit = int(record[7])
-            if record[8]:
+                digit = int(record.numeric_type)
+            if record.numeric_value:
                flags |= NUMERIC_MASK
-                numeric.setdefault(record[8], []).append(char)
+                numeric.setdefault(record.numeric_value, []).append(char)
            item = (
                upper, lower, title, decimal, digit, flags
                )
@@ -609,7 +610,7 @@ def makeunicodename(unicode, trace):
    for char in unicode.chars:
        record = unicode.table[char]
        if record:
-            name = record[1].strip()
+            name = record.name.strip()
            if name and name[0] != "<":
                names[char] = name + chr(0)
@@ -719,7 +720,7 @@ def word_key(a):
    for char in unicode.chars:
        record = unicode.table[char]
        if record:
-            name = record[1].strip()
+            name = record.name.strip()
            if name and name[0] != "<":
                data.append((name, char))
@@ -819,31 +820,27 @@ def merge_old_version(version, new, old):
            continue
        # check characters that differ
        if old.table[i] != new.table[i]:
-            for k in range(len(old.table[i])):
-                if old.table[i][k] != new.table[i][k]:
-                    value = old.table[i][k]
+            for k, field in enumerate(dataclasses.fields(UcdRecord)):
+                value = getattr(old.table[i], field.name)
+                new_value = getattr(new.table[i], field.name)
+                if value != new_value:
                    if k == 1 and i in PUA_15:
                        # the name is not set in the old.table, but in the
                        # new.table we are using it for aliases and named seq
                        assert value == ''
                    elif k == 2:
-                        #print "CATEGORY",hex(i), old.table[i][k], new.table[i][k]
                        category_changes[i] = CATEGORY_NAMES.index(value)
                    elif k == 4:
-                        #print "BIDIR",hex(i), old.table[i][k], new.table[i][k]
                        bidir_changes[i] = BIDIRECTIONAL_NAMES.index(value)
                    elif k == 5:
-                        #print "DECOMP",hex(i), old.table[i][k], new.table[i][k]
                        # We assume that all normalization changes are in 1:1 mappings
                        assert " " not in value
                        normalization_changes.append((i, value))
                    elif k == 6:
-                        #print "DECIMAL",hex(i), old.table[i][k], new.table[i][k]
                        # we only support changes where the old value is a single digit
                        assert value in "0123456789"
                        decimal_changes[i] = int(value)
                    elif k == 8:
-                        # print "NUMERIC",hex(i), `old.table[i][k]`, new.table[i][k]
                        # Since 0 encodes "no change", the old value is better not 0
                        if not value:
                            numeric_changes[i] = -1
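The new loop relies only on the standard dataclasses introspection API. Here is the same field-by-field comparison pattern in isolation, on a toy record type (Point is purely illustrative and not part of the script):

```python
import dataclasses

@dataclasses.dataclass
class Point:
    x: int
    y: int

old, new = Point(1, 2), Point(1, 5)
# dataclasses.fields() yields fields in definition order, so k matches
# the positional index the old list-based code used.
for k, field in enumerate(dataclasses.fields(Point)):
    value = getattr(old, field.name)
    new_value = getattr(new, field.name)
    if value != new_value:
        print(k, field.name, value, "->", new_value)  # prints: 1 y 2 -> 5
```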
@@ -952,25 +949,60 @@ def expanded(self) -> Iterator[Tuple[int, List[str]]]:
                yield char, rest


+@dataclasses.dataclass
+class UcdRecord:
+    # 15 fields from UnicodeData.txt.  See:
+    #   https://www.unicode.org/reports/tr44/#UnicodeData.txt
+    codepoint: str
+    name: str
+    general_category: str
+    canonical_combining_class: str
+    bidi_class: str
+    decomposition_type: str
+    decomposition_mapping: str
+    numeric_type: str
+    numeric_value: str
+    bidi_mirrored: str
+    unicode_1_name: str  # obsolete
+    iso_comment: str  # obsolete
+    simple_uppercase_mapping: str
+    simple_lowercase_mapping: str
+    simple_titlecase_mapping: str
+
+    # https://www.unicode.org/reports/tr44/#EastAsianWidth.txt
+    east_asian_width: Optional[str]
+
+    # Binary properties, as a set of those that are true.
+    # Taken from multiple files:
+    #   https://www.unicode.org/reports/tr44/#DerivedCoreProperties.txt
+    #   https://www.unicode.org/reports/tr44/#LineBreak.txt
+    binary_properties: Set[str]
+
+    # The Quick_Check properties related to normalization:
+    #   https://www.unicode.org/reports/tr44/#Decompositions_and_Normalization
+    # We store them as a bitmask.
+    quick_check: int
+
+
+def from_row(row: List[str]) -> UcdRecord:
+    return UcdRecord(*row, None, set(), 0)
+
+
# --------------------------------------------------------------------
# the following support code is taken from the unidb utilities
# Copyright (c) 1999-2000 by Secret Labs AB

# load a unicode-data file from disk

class UnicodeData:
-    # Record structure:
-    # [ID, name, category, combining, bidi, decomp,  (6)
-    #  decimal, digit, numeric, bidi-mirrored, Unicode-1-name, (11)
-    #  ISO-comment, uppercase, lowercase, titlecase, ea-width, (16)
-    #  derived-props] (17)
+    # table: List[Optional[UcdRecord]]  # index is codepoint; None means unassigned

    def __init__(self, version, cjk_check=True):
        self.changed = []
        table = [None] * 0x110000
        for s in UcdFile(UNICODE_DATA, version):
            char = int(s[0], 16)
-            table[char] = s
+            table[char] = from_row(s)

        cjk_ranges_found = []
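A minimal sketch of how the new constructor behaves on a single row. The U+0041 line below is taken from the standard UnicodeData.txt; the three trailing UcdRecord arguments are placeholders that later passes fill in:

```python
# 15 semicolon-separated fields, roughly as UcdFile yields them:
row = "0041;LATIN CAPITAL LETTER A;Lu;0;L;;;;;N;;;;0061;".split(";")

rec = from_row(row)
assert rec.codepoint == "0041"
assert rec.general_category == "Lu"
assert rec.simple_lowercase_mapping == "0061"
assert rec.east_asian_width is None    # patched in later from EastAsianWidth.txt
assert rec.binary_properties == set()  # populated later from DerivedCoreProperties/LineBreak
assert rec.quick_check == 0            # populated later from the normalization data
```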
@@ -982,19 +1014,17 @@ def __init__(self, version, cjk_check=True):
            #   https://www.unicode.org/reports/tr44/#Code_Point_Ranges
            s = table[i]
            if s:
-                if s[1][-6:] == "First>":
-                    s[1] = ""
-                    field = s
-                elif s[1][-5:] == "Last>":
-                    if s[1].startswith("<CJK Ideograph"):
+                if s.name[-6:] == "First>":
+                    s.name = ""
+                    field = dataclasses.astuple(s)[:15]
+                elif s.name[-5:] == "Last>":
+                    if s.name.startswith("<CJK Ideograph"):
                        cjk_ranges_found.append((field[0],
-                                                 s[0]))
-                    s[1] = ""
+                                                 s.codepoint))
+                    s.name = ""
                    field = None
            elif field:
-                f2 = field[:]
-                f2[0] = "%X" % i
-                table[i] = f2
+                table[i] = from_row(('%X' % i,) + field[1:])
        if cjk_check and cjk_ranges != cjk_ranges_found:
            raise ValueError("CJK ranges deviate: have %r" % cjk_ranges_found)
@@ -1015,7 +1045,7 @@ def __init__(self, version, cjk_check=True):
                char = int(char, 16)
                self.aliases.append((name, char))
                # also store the name in the PUA 1
-                self.table[pua_index][1] = name
+                self.table[pua_index].name = name
                pua_index += 1
            assert pua_index - NAME_ALIASES_START == len(self.aliases)
@@ -1034,7 +1064,7 @@ def __init__(self, version, cjk_check=True):
                    "the NamedSequence struct and in unicodedata_lookup")
                self.named_sequences.append((name, chars))
                # also store these in the PUA 1
-                self.table[pua_index][1] = name
+                self.table[pua_index].name = name
                pua_index += 1
            assert pua_index - NAMED_SEQUENCES_START == len(self.named_sequences)
@@ -1049,23 +1079,19 @@ def __init__(self, version, cjk_check=True):

        for i in range(0, 0x110000):
            if table[i] is not None:
-                table[i].append(widths[i])
-
-        for i in range(0, 0x110000):
-            if table[i] is not None:
-                table[i].append(set())
+                table[i].east_asian_width = widths[i]

        for char, (p,) in UcdFile(DERIVED_CORE_PROPERTIES, version).expanded():
            if table[char]:
                # Some properties (e.g. Default_Ignorable_Code_Point)
                # apply to unassigned code points; ignore them
-                table[char][-1].add(p)
+                table[char].binary_properties.add(p)

        for char_range, value in UcdFile(LINE_BREAK, version):
            if value not in MANDATORY_LINE_BREAKS:
                continue
            for char in expand_range(char_range):
-                table[char][-1].add('Line_Break')
+                table[char].binary_properties.add('Line_Break')

        # We only want the quickcheck properties
        # Format: NF?_QC; Y(es)/N(o)/M(aybe)
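Once those passes have run, a record's binary_properties is just a plain set of names, so membership tests are all that later code needs. A hypothetical check of the kind that could run after the loops above (Alphabetic comes from DerivedCoreProperties.txt, Line_Break is the pseudo-property added above):

```python
rec = table[0x0041]  # U+0041; assumes this runs inside __init__ after the passes above
if rec is not None:
    has_alpha = "Alphabetic" in rec.binary_properties
    has_mandatory_break = "Line_Break" in rec.binary_properties
```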
@@ -1087,7 +1113,7 @@ def __init__(self, version, cjk_check=True):
                quickchecks[char] |= quickcheck
        for i in range(0, 0x110000):
            if table[i] is not None:
-                table[i].append(quickchecks[i])
+                table[i].quick_check = quickchecks[i]

        with open_data(UNIHAN, version) as file:
            zip = zipfile.ZipFile(file)
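The quick_check attribute is an int because the four NF*_QC answers are packed into one bitmask. The exact encoding is built a few lines above this hunk and is not shown here; purely as an illustration, a packing of the kind the comments describe might look like this (names and bit layout are assumptions):

```python
# Illustrative only: two bits per normalization form, 0 = Yes (the default,
# never stored), 1 = Maybe, 2 = No.
QC_ORDER = ["NFD_QC", "NFKD_QC", "NFC_QC", "NFKC_QC"]

def pack_quick_check(prop: str, value: str) -> int:
    # Return the bit pattern to OR into a record's quick_check field.
    bits = {"M": 1, "N": 2}[value]  # "Y" contributes nothing
    return bits << (QC_ORDER.index(prop) * 2)

mask = pack_quick_check("NFC_QC", "N")  # 2 << 4 == 32
```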
@@ -1106,7 +1132,7 @@ def __init__(self, version, cjk_check=True):
            i = int(code[2:], 16)
            # Patch the numeric field
            if table[i] is not None:
-                table[i][8] = value
+                table[i].numeric_value = value

        sc = self.special_casing = {}
        for data in UcdFile(SPECIAL_CASING, version):