@@ -84,8 +84,8 @@ def fetch(f):
84
84
sys .stderr .write ("cannot load %s" % f )
85
85
exit (1 )
86
86
87
- def is_valid_unicode (n ):
88
- return 0 <= n <= 0xD7FF or 0xE000 <= n <= 0x10FFFF
87
+ def is_surrogate (n ):
88
+ return 0xD800 <= n <= 0xDFFF
89
89
90
90
def load_unicode_data (f ):
91
91
fetch (f )
@@ -96,19 +96,28 @@ def load_unicode_data(f):
96
96
canon_decomp = {}
97
97
compat_decomp = {}
98
98
99
+ udict = {};
100
+ range_start = - 1 ;
99
101
for line in fileinput .input (f ):
100
- fields = line .split (";" )
101
- if len (fields ) != 15 :
102
+ data = line .split (';' );
103
+ if len (data ) != 15 :
102
104
continue
103
- [code , name , gencat , combine , bidi ,
104
- decomp , deci , digit , num , mirror ,
105
- old , iso , upcase , lowcase , titlecase ] = fields
106
-
107
- code_org = code
108
- code = int (code , 16 )
109
-
110
- if not is_valid_unicode (code ):
105
+ cp = int (data [0 ], 16 );
106
+ if is_surrogate (cp ):
111
107
continue
108
+ if range_start >= 0 :
109
+ for i in xrange (range_start , cp ):
110
+ udict [i ] = data ;
111
+ range_start = - 1 ;
112
+ if data [1 ].endswith (", First>" ):
113
+ range_start = cp ;
114
+ continue ;
115
+ udict [cp ] = data ;
116
+
117
+ for code in udict :
118
+ [code_org , name , gencat , combine , bidi ,
119
+ decomp , deci , digit , num , mirror ,
120
+ old , iso , upcase , lowcase , titlecase ] = udict [code ];
112
121
113
122
# generate char to char direct common and simple conversions
114
123
# uppercase to lowercase
0 commit comments