Skip to content

Commit ef2af1a

Browse files
gnprice authored and benjaminp committed
bpo-37760: Factor out the basic UCD parsing logic of makeunicodedata. (pythonGH-15130)
There were 10 copies of this, and almost as many distinct versions of exactly how it was written. They're all implementing the same standard. Pull them out to the top, so the more interesting logic that remains becomes easier to read.
1 parent 66a34d3 commit ef2af1a

File tree

1 file changed

+109
-133
lines changed

1 file changed

+109
-133
lines changed

Tools/unicode/makeunicodedata.py

Lines changed: 109 additions & 133 deletions
Original file line numberDiff line numberDiff line change
@@ -30,8 +30,9 @@
3030
import sys
3131
import zipfile
3232

33-
from textwrap import dedent
3433
from functools import partial
34+
from textwrap import dedent
35+
from typing import *
3536

3637
SCRIPT = sys.argv[0]
3738
VERSION = "3.3"
@@ -903,6 +904,32 @@ def open_data(template, version):
903904
return open(local, 'rb')
904905

905906

907+
class UcdFile:
    """A file in the standard format of the UCD.

    See https://www.unicode.org/reports/tr44/#Format_Conventions for the
    conventions this parser implements.  Note that, as described there,
    the Unihan data files have their own separate format.
    """

    def __init__(self, template: str, version: str) -> None:
        # Stored verbatim; open_data() combines them into a concrete path.
        self.template = template
        self.version = version

    def records(self) -> Iterator[List[str]]:
        """Yield each data record as a list of stripped ';'-separated fields.

        Comments (everything after '#') and blank lines are skipped.
        """
        with open_data(self.template, self.version) as file:
            for raw in file:
                data = raw.split('#', 1)[0].strip()
                if data:
                    yield [field.strip() for field in data.split(';')]

    def __iter__(self) -> Iterator[List[str]]:
        # Iterating the file object is the same as iterating its records.
        return self.records()
931+
932+
906933
# --------------------------------------------------------------------
907934
# the following support code is taken from the unidb utilities
908935
# Copyright (c) 1999-2000 by Secret Labs AB
@@ -922,14 +949,9 @@ def __init__(self, version,
922949
cjk_check=True):
923950
self.changed = []
924951
table = [None] * 0x110000
925-
with open_data(UNICODE_DATA, version) as file:
926-
while 1:
927-
s = file.readline()
928-
if not s:
929-
break
930-
s = s.strip().split(";")
931-
char = int(s[0], 16)
932-
table[char] = s
952+
for s in UcdFile(UNICODE_DATA, version):
953+
char = int(s[0], 16)
954+
table[char] = s
933955

934956
cjk_ranges_found = []
935957

@@ -968,17 +990,12 @@ def __init__(self, version,
968990
# in order to take advantage of the compression and lookup
969991
# algorithms used for the other characters
970992
pua_index = NAME_ALIASES_START
971-
with open_data(NAME_ALIASES, version) as file:
972-
for s in file:
973-
s = s.strip()
974-
if not s or s.startswith('#'):
975-
continue
976-
char, name, abbrev = s.split(';')
977-
char = int(char, 16)
978-
self.aliases.append((name, char))
979-
# also store the name in the PUA 1
980-
self.table[pua_index][1] = name
981-
pua_index += 1
993+
for char, name, abbrev in UcdFile(NAME_ALIASES, version):
994+
char = int(char, 16)
995+
self.aliases.append((name, char))
996+
# also store the name in the PUA 1
997+
self.table[pua_index][1] = name
998+
pua_index += 1
982999
assert pua_index - NAME_ALIASES_START == len(self.aliases)
9831000

9841001
self.named_sequences = []
@@ -988,50 +1005,32 @@ def __init__(self, version,
9881005

9891006
assert pua_index < NAMED_SEQUENCES_START
9901007
pua_index = NAMED_SEQUENCES_START
991-
with open_data(NAMED_SEQUENCES, version) as file:
992-
for s in file:
993-
s = s.strip()
994-
if not s or s.startswith('#'):
995-
continue
996-
name, chars = s.split(';')
997-
chars = tuple(int(char, 16) for char in chars.split())
998-
# check that the structure defined in makeunicodename is OK
999-
assert 2 <= len(chars) <= 4, "change the Py_UCS2 array size"
1000-
assert all(c <= 0xFFFF for c in chars), ("use Py_UCS4 in "
1001-
"the NamedSequence struct and in unicodedata_lookup")
1002-
self.named_sequences.append((name, chars))
1003-
# also store these in the PUA 1
1004-
self.table[pua_index][1] = name
1005-
pua_index += 1
1008+
for name, chars in UcdFile(NAMED_SEQUENCES, version):
1009+
chars = tuple(int(char, 16) for char in chars.split())
1010+
# check that the structure defined in makeunicodename is OK
1011+
assert 2 <= len(chars) <= 4, "change the Py_UCS2 array size"
1012+
assert all(c <= 0xFFFF for c in chars), ("use Py_UCS4 in "
1013+
"the NamedSequence struct and in unicodedata_lookup")
1014+
self.named_sequences.append((name, chars))
1015+
# also store these in the PUA 1
1016+
self.table[pua_index][1] = name
1017+
pua_index += 1
10061018
assert pua_index - NAMED_SEQUENCES_START == len(self.named_sequences)
10071019

10081020
self.exclusions = {}
1009-
with open_data(COMPOSITION_EXCLUSIONS, version) as file:
1010-
for s in file:
1011-
s = s.strip()
1012-
if not s:
1013-
continue
1014-
if s[0] == '#':
1015-
continue
1016-
char = int(s.split()[0],16)
1017-
self.exclusions[char] = 1
1021+
for char, in UcdFile(COMPOSITION_EXCLUSIONS, version):
1022+
char = int(char, 16)
1023+
self.exclusions[char] = 1
10181024

10191025
widths = [None] * 0x110000
1020-
with open_data(EASTASIAN_WIDTH, version) as file:
1021-
for s in file:
1022-
s = s.strip()
1023-
if not s:
1024-
continue
1025-
if s[0] == '#':
1026-
continue
1027-
s = s.split()[0].split(';')
1028-
if '..' in s[0]:
1029-
first, last = [int(c, 16) for c in s[0].split('..')]
1030-
chars = list(range(first, last+1))
1031-
else:
1032-
chars = [int(s[0], 16)]
1033-
for char in chars:
1034-
widths[char] = s[1]
1026+
for s in UcdFile(EASTASIAN_WIDTH, version):
1027+
if '..' in s[0]:
1028+
first, last = [int(c, 16) for c in s[0].split('..')]
1029+
chars = list(range(first, last+1))
1030+
else:
1031+
chars = [int(s[0], 16)]
1032+
for char in chars:
1033+
widths[char] = s[1]
10351034

10361035
for i in range(0, 0x110000):
10371036
if table[i] is not None:
@@ -1041,38 +1040,27 @@ def __init__(self, version,
10411040
if table[i] is not None:
10421041
table[i].append(set())
10431042

1044-
with open_data(DERIVED_CORE_PROPERTIES, version) as file:
1045-
for s in file:
1046-
s = s.split('#', 1)[0].strip()
1047-
if not s:
1048-
continue
1049-
1050-
r, p = s.split(";")
1051-
r = r.strip()
1052-
p = p.strip()
1053-
if ".." in r:
1054-
first, last = [int(c, 16) for c in r.split('..')]
1055-
chars = list(range(first, last+1))
1056-
else:
1057-
chars = [int(r, 16)]
1058-
for char in chars:
1059-
if table[char]:
1060-
# Some properties (e.g. Default_Ignorable_Code_Point)
1061-
# apply to unassigned code points; ignore them
1062-
table[char][-1].add(p)
1063-
1064-
with open_data(LINE_BREAK, version) as file:
1065-
for s in file:
1066-
s = s.partition('#')[0]
1067-
s = [i.strip() for i in s.split(';')]
1068-
if len(s) < 2 or s[1] not in MANDATORY_LINE_BREAKS:
1069-
continue
1070-
if '..' not in s[0]:
1071-
first = last = int(s[0], 16)
1072-
else:
1073-
first, last = [int(c, 16) for c in s[0].split('..')]
1074-
for char in range(first, last+1):
1075-
table[char][-1].add('Line_Break')
1043+
for r, p in UcdFile(DERIVED_CORE_PROPERTIES, version):
1044+
if ".." in r:
1045+
first, last = [int(c, 16) for c in r.split('..')]
1046+
chars = list(range(first, last+1))
1047+
else:
1048+
chars = [int(r, 16)]
1049+
for char in chars:
1050+
if table[char]:
1051+
# Some properties (e.g. Default_Ignorable_Code_Point)
1052+
# apply to unassigned code points; ignore them
1053+
table[char][-1].add(p)
1054+
1055+
for s in UcdFile(LINE_BREAK, version):
1056+
if len(s) < 2 or s[1] not in MANDATORY_LINE_BREAKS:
1057+
continue
1058+
if '..' not in s[0]:
1059+
first = last = int(s[0], 16)
1060+
else:
1061+
first, last = [int(c, 16) for c in s[0].split('..')]
1062+
for char in range(first, last+1):
1063+
table[char][-1].add('Line_Break')
10761064

10771065
# We only want the quickcheck properties
10781066
# Format: NF?_QC; Y(es)/N(o)/M(aybe)
@@ -1083,23 +1071,19 @@ def __init__(self, version,
10831071
# for older versions, and no delta records will be created.
10841072
quickchecks = [0] * 0x110000
10851073
qc_order = 'NFD_QC NFKD_QC NFC_QC NFKC_QC'.split()
1086-
with open_data(DERIVEDNORMALIZATION_PROPS, version) as file:
1087-
for s in file:
1088-
if '#' in s:
1089-
s = s[:s.index('#')]
1090-
s = [i.strip() for i in s.split(';')]
1091-
if len(s) < 2 or s[1] not in qc_order:
1092-
continue
1093-
quickcheck = 'MN'.index(s[2]) + 1 # Maybe or No
1094-
quickcheck_shift = qc_order.index(s[1])*2
1095-
quickcheck <<= quickcheck_shift
1096-
if '..' not in s[0]:
1097-
first = last = int(s[0], 16)
1098-
else:
1099-
first, last = [int(c, 16) for c in s[0].split('..')]
1100-
for char in range(first, last+1):
1101-
assert not (quickchecks[char]>>quickcheck_shift)&3
1102-
quickchecks[char] |= quickcheck
1074+
for s in UcdFile(DERIVEDNORMALIZATION_PROPS, version):
1075+
if len(s) < 2 or s[1] not in qc_order:
1076+
continue
1077+
quickcheck = 'MN'.index(s[2]) + 1 # Maybe or No
1078+
quickcheck_shift = qc_order.index(s[1])*2
1079+
quickcheck <<= quickcheck_shift
1080+
if '..' not in s[0]:
1081+
first = last = int(s[0], 16)
1082+
else:
1083+
first, last = [int(c, 16) for c in s[0].split('..')]
1084+
for char in range(first, last+1):
1085+
assert not (quickchecks[char]>>quickcheck_shift)&3
1086+
quickchecks[char] |= quickcheck
11031087
for i in range(0, 0x110000):
11041088
if table[i] is not None:
11051089
table[i].append(quickchecks[i])
@@ -1122,34 +1106,26 @@ def __init__(self, version,
11221106
# Patch the numeric field
11231107
if table[i] is not None:
11241108
table[i][8] = value
1109+
11251110
sc = self.special_casing = {}
1126-
with open_data(SPECIAL_CASING, version) as file:
1127-
for s in file:
1128-
s = s[:-1].split('#', 1)[0]
1129-
if not s:
1130-
continue
1131-
data = s.split("; ")
1132-
if data[4]:
1133-
# We ignore all conditionals (since they depend on
1134-
# languages) except for one, which is hardcoded. See
1135-
# handle_capital_sigma in unicodeobject.c.
1136-
continue
1137-
c = int(data[0], 16)
1138-
lower = [int(char, 16) for char in data[1].split()]
1139-
title = [int(char, 16) for char in data[2].split()]
1140-
upper = [int(char, 16) for char in data[3].split()]
1141-
sc[c] = (lower, title, upper)
1111+
for data in UcdFile(SPECIAL_CASING, version):
1112+
if data[4]:
1113+
# We ignore all conditionals (since they depend on
1114+
# languages) except for one, which is hardcoded. See
1115+
# handle_capital_sigma in unicodeobject.c.
1116+
continue
1117+
c = int(data[0], 16)
1118+
lower = [int(char, 16) for char in data[1].split()]
1119+
title = [int(char, 16) for char in data[2].split()]
1120+
upper = [int(char, 16) for char in data[3].split()]
1121+
sc[c] = (lower, title, upper)
1122+
11421123
cf = self.case_folding = {}
11431124
if version != '3.2.0':
1144-
with open_data(CASE_FOLDING, version) as file:
1145-
for s in file:
1146-
s = s[:-1].split('#', 1)[0]
1147-
if not s:
1148-
continue
1149-
data = s.split("; ")
1150-
if data[1] in "CF":
1151-
c = int(data[0], 16)
1152-
cf[c] = [int(char, 16) for char in data[2].split()]
1125+
for data in UcdFile(CASE_FOLDING, version):
1126+
if data[1] in "CF":
1127+
c = int(data[0], 16)
1128+
cf[c] = [int(char, 16) for char in data[2].split()]
11531129

11541130
def uselatin1(self):
11551131
# restrict character range to ISO Latin 1

0 commit comments

Comments
 (0)