30
30
import sys
31
31
import zipfile
32
32
33
- from textwrap import dedent
34
33
from functools import partial
34
+ from textwrap import dedent
35
+ from typing import *
35
36
36
37
SCRIPT = sys .argv [0 ]
37
38
VERSION = "3.3"
@@ -903,6 +904,32 @@ def open_data(template, version):
903
904
return open (local , 'rb' )
904
905
905
906
907
class UcdFile:
    '''
    A file in the standard format of the UCD.

    See: https://www.unicode.org/reports/tr44/#Format_Conventions

    Note that, as described there, the Unihan data files have their
    own separate format.
    '''

    def __init__(self, template: str, version: str) -> None:
        # Filename template and UCD version, passed through to open_data().
        self.template = template
        self.version = version

    def records(self) -> Iterator[List[str]]:
        # Yield one record per data line as a list of stripped ';'-separated
        # fields; comment text (after '#') and blank lines are skipped.
        with open_data(self.template, self.version) as file:
            for raw_line in file:
                record = raw_line.split('#', 1)[0].strip()
                if record:
                    yield [part.strip() for part in record.split(';')]

    def __iter__(self) -> Iterator[List[str]]:
        return self.records()
906
933
# --------------------------------------------------------------------
907
934
# the following support code is taken from the unidb utilities
908
935
# Copyright (c) 1999-2000 by Secret Labs AB
@@ -922,14 +949,9 @@ def __init__(self, version,
922
949
cjk_check = True ):
923
950
self .changed = []
924
951
table = [None ] * 0x110000
925
- with open_data (UNICODE_DATA , version ) as file :
926
- while 1 :
927
- s = file .readline ()
928
- if not s :
929
- break
930
- s = s .strip ().split (";" )
931
- char = int (s [0 ], 16 )
932
- table [char ] = s
952
+ for s in UcdFile (UNICODE_DATA , version ):
953
+ char = int (s [0 ], 16 )
954
+ table [char ] = s
933
955
934
956
cjk_ranges_found = []
935
957
@@ -968,17 +990,12 @@ def __init__(self, version,
968
990
# in order to take advantage of the compression and lookup
969
991
# algorithms used for the other characters
970
992
pua_index = NAME_ALIASES_START
971
- with open_data (NAME_ALIASES , version ) as file :
972
- for s in file :
973
- s = s .strip ()
974
- if not s or s .startswith ('#' ):
975
- continue
976
- char , name , abbrev = s .split (';' )
977
- char = int (char , 16 )
978
- self .aliases .append ((name , char ))
979
- # also store the name in the PUA 1
980
- self .table [pua_index ][1 ] = name
981
- pua_index += 1
993
+ for char , name , abbrev in UcdFile (NAME_ALIASES , version ):
994
+ char = int (char , 16 )
995
+ self .aliases .append ((name , char ))
996
+ # also store the name in the PUA 1
997
+ self .table [pua_index ][1 ] = name
998
+ pua_index += 1
982
999
assert pua_index - NAME_ALIASES_START == len (self .aliases )
983
1000
984
1001
self .named_sequences = []
@@ -988,50 +1005,32 @@ def __init__(self, version,
988
1005
989
1006
assert pua_index < NAMED_SEQUENCES_START
990
1007
pua_index = NAMED_SEQUENCES_START
991
- with open_data (NAMED_SEQUENCES , version ) as file :
992
- for s in file :
993
- s = s .strip ()
994
- if not s or s .startswith ('#' ):
995
- continue
996
- name , chars = s .split (';' )
997
- chars = tuple (int (char , 16 ) for char in chars .split ())
998
- # check that the structure defined in makeunicodename is OK
999
- assert 2 <= len (chars ) <= 4 , "change the Py_UCS2 array size"
1000
- assert all (c <= 0xFFFF for c in chars ), ("use Py_UCS4 in "
1001
- "the NamedSequence struct and in unicodedata_lookup" )
1002
- self .named_sequences .append ((name , chars ))
1003
- # also store these in the PUA 1
1004
- self .table [pua_index ][1 ] = name
1005
- pua_index += 1
1008
+ for name , chars in UcdFile (NAMED_SEQUENCES , version ):
1009
+ chars = tuple (int (char , 16 ) for char in chars .split ())
1010
+ # check that the structure defined in makeunicodename is OK
1011
+ assert 2 <= len (chars ) <= 4 , "change the Py_UCS2 array size"
1012
+ assert all (c <= 0xFFFF for c in chars ), ("use Py_UCS4 in "
1013
+ "the NamedSequence struct and in unicodedata_lookup" )
1014
+ self .named_sequences .append ((name , chars ))
1015
+ # also store these in the PUA 1
1016
+ self .table [pua_index ][1 ] = name
1017
+ pua_index += 1
1006
1018
assert pua_index - NAMED_SEQUENCES_START == len (self .named_sequences )
1007
1019
1008
1020
self .exclusions = {}
1009
- with open_data (COMPOSITION_EXCLUSIONS , version ) as file :
1010
- for s in file :
1011
- s = s .strip ()
1012
- if not s :
1013
- continue
1014
- if s [0 ] == '#' :
1015
- continue
1016
- char = int (s .split ()[0 ],16 )
1017
- self .exclusions [char ] = 1
1021
+ for char , in UcdFile (COMPOSITION_EXCLUSIONS , version ):
1022
+ char = int (char , 16 )
1023
+ self .exclusions [char ] = 1
1018
1024
1019
1025
widths = [None ] * 0x110000
1020
- with open_data (EASTASIAN_WIDTH , version ) as file :
1021
- for s in file :
1022
- s = s .strip ()
1023
- if not s :
1024
- continue
1025
- if s [0 ] == '#' :
1026
- continue
1027
- s = s .split ()[0 ].split (';' )
1028
- if '..' in s [0 ]:
1029
- first , last = [int (c , 16 ) for c in s [0 ].split ('..' )]
1030
- chars = list (range (first , last + 1 ))
1031
- else :
1032
- chars = [int (s [0 ], 16 )]
1033
- for char in chars :
1034
- widths [char ] = s [1 ]
1026
+ for s in UcdFile (EASTASIAN_WIDTH , version ):
1027
+ if '..' in s [0 ]:
1028
+ first , last = [int (c , 16 ) for c in s [0 ].split ('..' )]
1029
+ chars = list (range (first , last + 1 ))
1030
+ else :
1031
+ chars = [int (s [0 ], 16 )]
1032
+ for char in chars :
1033
+ widths [char ] = s [1 ]
1035
1034
1036
1035
for i in range (0 , 0x110000 ):
1037
1036
if table [i ] is not None :
@@ -1041,38 +1040,27 @@ def __init__(self, version,
1041
1040
if table [i ] is not None :
1042
1041
table [i ].append (set ())
1043
1042
1044
- with open_data (DERIVED_CORE_PROPERTIES , version ) as file :
1045
- for s in file :
1046
- s = s .split ('#' , 1 )[0 ].strip ()
1047
- if not s :
1048
- continue
1049
-
1050
- r , p = s .split (";" )
1051
- r = r .strip ()
1052
- p = p .strip ()
1053
- if ".." in r :
1054
- first , last = [int (c , 16 ) for c in r .split ('..' )]
1055
- chars = list (range (first , last + 1 ))
1056
- else :
1057
- chars = [int (r , 16 )]
1058
- for char in chars :
1059
- if table [char ]:
1060
- # Some properties (e.g. Default_Ignorable_Code_Point)
1061
- # apply to unassigned code points; ignore them
1062
- table [char ][- 1 ].add (p )
1063
-
1064
- with open_data (LINE_BREAK , version ) as file :
1065
- for s in file :
1066
- s = s .partition ('#' )[0 ]
1067
- s = [i .strip () for i in s .split (';' )]
1068
- if len (s ) < 2 or s [1 ] not in MANDATORY_LINE_BREAKS :
1069
- continue
1070
- if '..' not in s [0 ]:
1071
- first = last = int (s [0 ], 16 )
1072
- else :
1073
- first , last = [int (c , 16 ) for c in s [0 ].split ('..' )]
1074
- for char in range (first , last + 1 ):
1075
- table [char ][- 1 ].add ('Line_Break' )
1043
+ for r , p in UcdFile (DERIVED_CORE_PROPERTIES , version ):
1044
+ if ".." in r :
1045
+ first , last = [int (c , 16 ) for c in r .split ('..' )]
1046
+ chars = list (range (first , last + 1 ))
1047
+ else :
1048
+ chars = [int (r , 16 )]
1049
+ for char in chars :
1050
+ if table [char ]:
1051
+ # Some properties (e.g. Default_Ignorable_Code_Point)
1052
+ # apply to unassigned code points; ignore them
1053
+ table [char ][- 1 ].add (p )
1054
+
1055
+ for s in UcdFile (LINE_BREAK , version ):
1056
+ if len (s ) < 2 or s [1 ] not in MANDATORY_LINE_BREAKS :
1057
+ continue
1058
+ if '..' not in s [0 ]:
1059
+ first = last = int (s [0 ], 16 )
1060
+ else :
1061
+ first , last = [int (c , 16 ) for c in s [0 ].split ('..' )]
1062
+ for char in range (first , last + 1 ):
1063
+ table [char ][- 1 ].add ('Line_Break' )
1076
1064
1077
1065
# We only want the quickcheck properties
1078
1066
# Format: NF?_QC; Y(es)/N(o)/M(aybe)
@@ -1083,23 +1071,19 @@ def __init__(self, version,
1083
1071
# for older versions, and no delta records will be created.
1084
1072
quickchecks = [0 ] * 0x110000
1085
1073
qc_order = 'NFD_QC NFKD_QC NFC_QC NFKC_QC' .split ()
1086
- with open_data (DERIVEDNORMALIZATION_PROPS , version ) as file :
1087
- for s in file :
1088
- if '#' in s :
1089
- s = s [:s .index ('#' )]
1090
- s = [i .strip () for i in s .split (';' )]
1091
- if len (s ) < 2 or s [1 ] not in qc_order :
1092
- continue
1093
- quickcheck = 'MN' .index (s [2 ]) + 1 # Maybe or No
1094
- quickcheck_shift = qc_order .index (s [1 ])* 2
1095
- quickcheck <<= quickcheck_shift
1096
- if '..' not in s [0 ]:
1097
- first = last = int (s [0 ], 16 )
1098
- else :
1099
- first , last = [int (c , 16 ) for c in s [0 ].split ('..' )]
1100
- for char in range (first , last + 1 ):
1101
- assert not (quickchecks [char ]>> quickcheck_shift )& 3
1102
- quickchecks [char ] |= quickcheck
1074
+ for s in UcdFile (DERIVEDNORMALIZATION_PROPS , version ):
1075
+ if len (s ) < 2 or s [1 ] not in qc_order :
1076
+ continue
1077
+ quickcheck = 'MN' .index (s [2 ]) + 1 # Maybe or No
1078
+ quickcheck_shift = qc_order .index (s [1 ])* 2
1079
+ quickcheck <<= quickcheck_shift
1080
+ if '..' not in s [0 ]:
1081
+ first = last = int (s [0 ], 16 )
1082
+ else :
1083
+ first , last = [int (c , 16 ) for c in s [0 ].split ('..' )]
1084
+ for char in range (first , last + 1 ):
1085
+ assert not (quickchecks [char ]>> quickcheck_shift )& 3
1086
+ quickchecks [char ] |= quickcheck
1103
1087
for i in range (0 , 0x110000 ):
1104
1088
if table [i ] is not None :
1105
1089
table [i ].append (quickchecks [i ])
@@ -1122,34 +1106,26 @@ def __init__(self, version,
1122
1106
# Patch the numeric field
1123
1107
if table [i ] is not None :
1124
1108
table [i ][8 ] = value
1109
+
1125
1110
sc = self .special_casing = {}
1126
- with open_data (SPECIAL_CASING , version ) as file :
1127
- for s in file :
1128
- s = s [:- 1 ].split ('#' , 1 )[0 ]
1129
- if not s :
1130
- continue
1131
- data = s .split ("; " )
1132
- if data [4 ]:
1133
- # We ignore all conditionals (since they depend on
1134
- # languages) except for one, which is hardcoded. See
1135
- # handle_capital_sigma in unicodeobject.c.
1136
- continue
1137
- c = int (data [0 ], 16 )
1138
- lower = [int (char , 16 ) for char in data [1 ].split ()]
1139
- title = [int (char , 16 ) for char in data [2 ].split ()]
1140
- upper = [int (char , 16 ) for char in data [3 ].split ()]
1141
- sc [c ] = (lower , title , upper )
1111
+ for data in UcdFile (SPECIAL_CASING , version ):
1112
+ if data [4 ]:
1113
+ # We ignore all conditionals (since they depend on
1114
+ # languages) except for one, which is hardcoded. See
1115
+ # handle_capital_sigma in unicodeobject.c.
1116
+ continue
1117
+ c = int (data [0 ], 16 )
1118
+ lower = [int (char , 16 ) for char in data [1 ].split ()]
1119
+ title = [int (char , 16 ) for char in data [2 ].split ()]
1120
+ upper = [int (char , 16 ) for char in data [3 ].split ()]
1121
+ sc [c ] = (lower , title , upper )
1122
+
1142
1123
cf = self .case_folding = {}
1143
1124
if version != '3.2.0' :
1144
- with open_data (CASE_FOLDING , version ) as file :
1145
- for s in file :
1146
- s = s [:- 1 ].split ('#' , 1 )[0 ]
1147
- if not s :
1148
- continue
1149
- data = s .split ("; " )
1150
- if data [1 ] in "CF" :
1151
- c = int (data [0 ], 16 )
1152
- cf [c ] = [int (char , 16 ) for char in data [2 ].split ()]
1125
+ for data in UcdFile (CASE_FOLDING , version ):
1126
+ if data [1 ] in "CF" :
1127
+ c = int (data [0 ], 16 )
1128
+ cf [c ] = [int (char , 16 ) for char in data [2 ].split ()]
1153
1129
1154
1130
def uselatin1 (self ):
1155
1131
# restrict character range to ISO Latin 1
0 commit comments