Skip to content

Commit a65678c

Browse files
gnprice authored and benjaminp committed
bpo-37760: Convert from length-18 lists to a dataclass, in makeunicodedata. (GH-15265)
Now the fields have names! Much easier to keep straight as a reader than the elements of an 18-tuple. Runs about 10-15% slower: from 10.8s to 12.3s, on my laptop. Fortunately that's perfectly fine for this maintenance script.
1 parent 5e9caee commit a65678c

File tree

2 files changed

+94
-62
lines changed

2 files changed

+94
-62
lines changed
Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
The :file:`Tools/unicode/makeunicodedata.py` script, which is used for
2+
converting information from the Unicode Character Database into generated
3+
code and data used by the methods of :class:`str` and by the
4+
:mod:`unicodedata` module, now handles each character's data as a
5+
``dataclass`` with named attributes, rather than a length-18 list of
6+
different fields.

Tools/unicode/makeunicodedata.py

Lines changed: 88 additions & 62 deletions
Original file line numberDiff line numberDiff line change
@@ -26,13 +26,14 @@
2626
# written by Fredrik Lundh (fredrik@pythonware.com)
2727
#
2828

29+
import dataclasses
2930
import os
3031
import sys
3132
import zipfile
3233

3334
from functools import partial
3435
from textwrap import dedent
35-
from typing import Iterator, List, Tuple
36+
from typing import Iterator, List, Optional, Set, Tuple
3637

3738
SCRIPT = sys.argv[0]
3839
VERSION = "3.3"
@@ -148,12 +149,12 @@ def makeunicodedata(unicode, trace):
148149
record = unicode.table[char]
149150
if record:
150151
# extract database properties
151-
category = CATEGORY_NAMES.index(record[2])
152-
combining = int(record[3])
153-
bidirectional = BIDIRECTIONAL_NAMES.index(record[4])
154-
mirrored = record[9] == "Y"
155-
eastasianwidth = EASTASIANWIDTH_NAMES.index(record[15])
156-
normalizationquickcheck = record[17]
152+
category = CATEGORY_NAMES.index(record.general_category)
153+
combining = int(record.canonical_combining_class)
154+
bidirectional = BIDIRECTIONAL_NAMES.index(record.bidi_class)
155+
mirrored = record.bidi_mirrored == "Y"
156+
eastasianwidth = EASTASIANWIDTH_NAMES.index(record.east_asian_width)
157+
normalizationquickcheck = record.quick_check
157158
item = (
158159
category, combining, bidirectional, mirrored, eastasianwidth,
159160
normalizationquickcheck
@@ -179,8 +180,8 @@ def makeunicodedata(unicode, trace):
179180
for char in unicode.chars:
180181
record = unicode.table[char]
181182
if record:
182-
if record[5]:
183-
decomp = record[5].split()
183+
if record.decomposition_type:
184+
decomp = record.decomposition_type.split()
184185
if len(decomp) > 19:
185186
raise Exception("character %x has a decomposition too large for nfd_nfkd" % char)
186187
# prefix
@@ -200,7 +201,7 @@ def makeunicodedata(unicode, trace):
200201
# Collect NFC pairs
201202
if not prefix and len(decomp) == 3 and \
202203
char not in unicode.exclusions and \
203-
unicode.table[decomp[1]][3] == "0":
204+
unicode.table[decomp[1]].canonical_combining_class == "0":
204205
p, l, r = decomp
205206
comp_first[l] = 1
206207
comp_last[r] = 1
@@ -404,9 +405,9 @@ def makeunicodetype(unicode, trace):
404405
record = unicode.table[char]
405406
if record:
406407
# extract database properties
407-
category = record[2]
408-
bidirectional = record[4]
409-
properties = record[16]
408+
category = record.general_category
409+
bidirectional = record.bidi_class
410+
properties = record.binary_properties
410411
flags = 0
411412
if category in ["Lm", "Lt", "Lu", "Ll", "Lo"]:
412413
flags |= ALPHA_MASK
@@ -434,16 +435,16 @@ def makeunicodetype(unicode, trace):
434435
flags |= CASE_IGNORABLE_MASK
435436
sc = unicode.special_casing.get(char)
436437
cf = unicode.case_folding.get(char, [char])
437-
if record[12]:
438-
upper = int(record[12], 16)
438+
if record.simple_uppercase_mapping:
439+
upper = int(record.simple_uppercase_mapping, 16)
439440
else:
440441
upper = char
441-
if record[13]:
442-
lower = int(record[13], 16)
442+
if record.simple_lowercase_mapping:
443+
lower = int(record.simple_lowercase_mapping, 16)
443444
else:
444445
lower = char
445-
if record[14]:
446-
title = int(record[14], 16)
446+
if record.simple_titlecase_mapping:
447+
title = int(record.simple_titlecase_mapping, 16)
447448
else:
448449
title = upper
449450
if sc is None and cf != [lower]:
@@ -480,16 +481,16 @@ def makeunicodetype(unicode, trace):
480481
extra_casing.extend(sc[1])
481482
# decimal digit, integer digit
482483
decimal = 0
483-
if record[6]:
484+
if record.decomposition_mapping:
484485
flags |= DECIMAL_MASK
485-
decimal = int(record[6])
486+
decimal = int(record.decomposition_mapping)
486487
digit = 0
487-
if record[7]:
488+
if record.numeric_type:
488489
flags |= DIGIT_MASK
489-
digit = int(record[7])
490-
if record[8]:
490+
digit = int(record.numeric_type)
491+
if record.numeric_value:
491492
flags |= NUMERIC_MASK
492-
numeric.setdefault(record[8], []).append(char)
493+
numeric.setdefault(record.numeric_value, []).append(char)
493494
item = (
494495
upper, lower, title, decimal, digit, flags
495496
)
@@ -609,7 +610,7 @@ def makeunicodename(unicode, trace):
609610
for char in unicode.chars:
610611
record = unicode.table[char]
611612
if record:
612-
name = record[1].strip()
613+
name = record.name.strip()
613614
if name and name[0] != "<":
614615
names[char] = name + chr(0)
615616

@@ -719,7 +720,7 @@ def word_key(a):
719720
for char in unicode.chars:
720721
record = unicode.table[char]
721722
if record:
722-
name = record[1].strip()
723+
name = record.name.strip()
723724
if name and name[0] != "<":
724725
data.append((name, char))
725726

@@ -819,31 +820,27 @@ def merge_old_version(version, new, old):
819820
continue
820821
# check characters that differ
821822
if old.table[i] != new.table[i]:
822-
for k in range(len(old.table[i])):
823-
if old.table[i][k] != new.table[i][k]:
824-
value = old.table[i][k]
823+
for k, field in enumerate(dataclasses.fields(UcdRecord)):
824+
value = getattr(old.table[i], field.name)
825+
new_value = getattr(new.table[i], field.name)
826+
if value != new_value:
825827
if k == 1 and i in PUA_15:
826828
# the name is not set in the old.table, but in the
827829
# new.table we are using it for aliases and named seq
828830
assert value == ''
829831
elif k == 2:
830-
#print "CATEGORY",hex(i), old.table[i][k], new.table[i][k]
831832
category_changes[i] = CATEGORY_NAMES.index(value)
832833
elif k == 4:
833-
#print "BIDIR",hex(i), old.table[i][k], new.table[i][k]
834834
bidir_changes[i] = BIDIRECTIONAL_NAMES.index(value)
835835
elif k == 5:
836-
#print "DECOMP",hex(i), old.table[i][k], new.table[i][k]
837836
# We assume that all normalization changes are in 1:1 mappings
838837
assert " " not in value
839838
normalization_changes.append((i, value))
840839
elif k == 6:
841-
#print "DECIMAL",hex(i), old.table[i][k], new.table[i][k]
842840
# we only support changes where the old value is a single digit
843841
assert value in "0123456789"
844842
decimal_changes[i] = int(value)
845843
elif k == 8:
846-
# print "NUMERIC",hex(i), `old.table[i][k]`, new.table[i][k]
847844
# Since 0 encodes "no change", the old value is better not 0
848845
if not value:
849846
numeric_changes[i] = -1
@@ -952,25 +949,60 @@ def expanded(self) -> Iterator[Tuple[int, List[str]]]:
952949
yield char, rest
953950

954951

952+
@dataclasses.dataclass
953+
class UcdRecord:
954+
# One character's data.  The first 15 fields mirror the columns of a UnicodeData.txt row, in file order (from_row() fills them positionally).  See:
955+
# https://www.unicode.org/reports/tr44/#UnicodeData.txt
956+
codepoint: str  # hexadecimal string (parsed with int(s, 16); range-expanded entries use '%X' % i)
957+
name: str  # reset to "" for <..., First>/<..., Last> range markers; PUA slots receive alias / named-sequence names
958+
general_category: str  # looked up in CATEGORY_NAMES by consumers
959+
canonical_combining_class: str  # decimal string: consumers call int() on it or compare it with "0"
960+
bidi_class: str  # looked up in BIDIRECTIONAL_NAMES by consumers
961+
decomposition_type: str  # NOTE: holds the whole decomposition column; consumers .split() it into an optional prefix plus code points
962+
decomposition_mapping: str  # NOTE: despite the name, consumers read this as the decimal-digit value (int() -> DECIMAL_MASK)
963+
numeric_type: str  # NOTE: despite the name, consumers read this as the digit value (int() -> DIGIT_MASK)
964+
numeric_value: str  # numeric value (NUMERIC_MASK); may later be patched from the Unihan data
965+
bidi_mirrored: str  # compared against "Y" by consumers
966+
unicode_1_name: str # obsolete
967+
iso_comment: str # obsolete
968+
simple_uppercase_mapping: str  # hex code point string, or empty when there is no mapping
969+
simple_lowercase_mapping: str  # hex code point string, or empty when there is no mapping
970+
simple_titlecase_mapping: str  # hex code point string, or empty (consumers then fall back to the uppercase value)
971+
972+
# https://www.unicode.org/reports/tr44/#EastAsianWidth.txt
973+
east_asian_width: Optional[str]  # None as built by from_row(); assigned later in UnicodeData.__init__
974+
975+
# Binary properties, as a set of those that are true.
976+
# Starts empty (see from_row) and is filled in from multiple files:
977+
# https://www.unicode.org/reports/tr44/#DerivedCoreProperties.txt
978+
# https://www.unicode.org/reports/tr44/#LineBreak.txt
979+
binary_properties: Set[str]  # also gets 'Line_Break' for characters with a mandatory line-break value
980+
981+
# The Quick_Check properties related to normalization:
982+
# https://www.unicode.org/reports/tr44/#Decompositions_and_Normalization
983+
# We store them as a bitmask (0 as built by from_row(); assigned later in UnicodeData.__init__).
984+
quick_check: int
985+
986+
987+
# Build a UcdRecord from one UnicodeData.txt row.  The row's items are passed
# positionally, so it must supply exactly the 15 UnicodeData.txt fields, in
# file order; the three remaining fields get placeholders (east_asian_width
# None, binary_properties an empty set, quick_check 0) to be filled in later.
# NOTE(review): UnicodeData.__init__ also calls this with a tuple when
# expanding ranges, so the List[str] annotation is narrower than actual use.
def from_row(row: List[str]) -> UcdRecord:
988+
return UcdRecord(*row, None, set(), 0)
989+
990+
955991
# --------------------------------------------------------------------
956992
# the following support code is taken from the unidb utilities
957993
# Copyright (c) 1999-2000 by Secret Labs AB
958994

959995
# load a unicode-data file from disk
960996

961997
class UnicodeData:
962-
# Record structure:
963-
# [ID, name, category, combining, bidi, decomp, (6)
964-
# decimal, digit, numeric, bidi-mirrored, Unicode-1-name, (11)
965-
# ISO-comment, uppercase, lowercase, titlecase, ea-width, (16)
966-
# derived-props] (17)
998+
# table: List[Optional[UcdRecord]] # index is codepoint; None means unassigned
967999

9681000
def __init__(self, version, cjk_check=True):
9691001
self.changed = []
9701002
table = [None] * 0x110000
9711003
for s in UcdFile(UNICODE_DATA, version):
9721004
char = int(s[0], 16)
973-
table[char] = s
1005+
table[char] = from_row(s)
9741006

9751007
cjk_ranges_found = []
9761008

@@ -982,19 +1014,17 @@ def __init__(self, version, cjk_check=True):
9821014
# https://www.unicode.org/reports/tr44/#Code_Point_Ranges
9831015
s = table[i]
9841016
if s:
985-
if s[1][-6:] == "First>":
986-
s[1] = ""
987-
field = s
988-
elif s[1][-5:] == "Last>":
989-
if s[1].startswith("<CJK Ideograph"):
1017+
if s.name[-6:] == "First>":
1018+
s.name = ""
1019+
field = dataclasses.astuple(s)[:15]
1020+
elif s.name[-5:] == "Last>":
1021+
if s.name.startswith("<CJK Ideograph"):
9901022
cjk_ranges_found.append((field[0],
991-
s[0]))
992-
s[1] = ""
1023+
s.codepoint))
1024+
s.name = ""
9931025
field = None
9941026
elif field:
995-
f2 = field[:]
996-
f2[0] = "%X" % i
997-
table[i] = f2
1027+
table[i] = from_row(('%X' % i,) + field[1:])
9981028
if cjk_check and cjk_ranges != cjk_ranges_found:
9991029
raise ValueError("CJK ranges deviate: have %r" % cjk_ranges_found)
10001030

@@ -1015,7 +1045,7 @@ def __init__(self, version, cjk_check=True):
10151045
char = int(char, 16)
10161046
self.aliases.append((name, char))
10171047
# also store the name in the PUA 1
1018-
self.table[pua_index][1] = name
1048+
self.table[pua_index].name = name
10191049
pua_index += 1
10201050
assert pua_index - NAME_ALIASES_START == len(self.aliases)
10211051

@@ -1034,7 +1064,7 @@ def __init__(self, version, cjk_check=True):
10341064
"the NamedSequence struct and in unicodedata_lookup")
10351065
self.named_sequences.append((name, chars))
10361066
# also store these in the PUA 1
1037-
self.table[pua_index][1] = name
1067+
self.table[pua_index].name = name
10381068
pua_index += 1
10391069
assert pua_index - NAMED_SEQUENCES_START == len(self.named_sequences)
10401070

@@ -1049,23 +1079,19 @@ def __init__(self, version, cjk_check=True):
10491079

10501080
for i in range(0, 0x110000):
10511081
if table[i] is not None:
1052-
table[i].append(widths[i])
1053-
1054-
for i in range(0, 0x110000):
1055-
if table[i] is not None:
1056-
table[i].append(set())
1082+
table[i].east_asian_width = widths[i]
10571083

10581084
for char, (p,) in UcdFile(DERIVED_CORE_PROPERTIES, version).expanded():
10591085
if table[char]:
10601086
# Some properties (e.g. Default_Ignorable_Code_Point)
10611087
# apply to unassigned code points; ignore them
1062-
table[char][-1].add(p)
1088+
table[char].binary_properties.add(p)
10631089

10641090
for char_range, value in UcdFile(LINE_BREAK, version):
10651091
if value not in MANDATORY_LINE_BREAKS:
10661092
continue
10671093
for char in expand_range(char_range):
1068-
table[char][-1].add('Line_Break')
1094+
table[char].binary_properties.add('Line_Break')
10691095

10701096
# We only want the quickcheck properties
10711097
# Format: NF?_QC; Y(es)/N(o)/M(aybe)
@@ -1087,7 +1113,7 @@ def __init__(self, version, cjk_check=True):
10871113
quickchecks[char] |= quickcheck
10881114
for i in range(0, 0x110000):
10891115
if table[i] is not None:
1090-
table[i].append(quickchecks[i])
1116+
table[i].quick_check = quickchecks[i]
10911117

10921118
with open_data(UNIHAN, version) as file:
10931119
zip = zipfile.ZipFile(file)
@@ -1106,7 +1132,7 @@ def __init__(self, version, cjk_check=True):
11061132
i = int(code[2:], 16)
11071133
# Patch the numeric field
11081134
if table[i] is not None:
1109-
table[i][8] = value
1135+
table[i].numeric_value = value
11101136

11111137
sc = self.special_casing = {}
11121138
for data in UcdFile(SPECIAL_CASING, version):

0 commit comments

Comments
 (0)