
Commit b1fae75

Use tokenize from stdlib, detach completely from lib2to3 and fix some typos
Parent: 3d593ef

File tree (5 files changed, +140 −48 lines):

Makefile.pre.in
Parser/pgen/__main__.py
Parser/pgen/grammar.py
Parser/pgen/pgen.py
Parser/pgen/token.py

Makefile.pre.in

Lines changed: 1 addition & 0 deletions
@@ -788,6 +788,7 @@ regen-grammar: regen-token
 	# from Grammar/Grammar using pgen
 	@$(MKDIR_P) Include
 	$(PYTHON_FOR_REGEN) -m Parser.pgen $(srcdir)/Grammar/Grammar \
+		$(srcdir)/Grammar/Tokens \
 		$(srcdir)/Include/graminit.h.new \
 		$(srcdir)/Python/graminit.c.new
 	$(UPDATE_FILE) $(srcdir)/Include/graminit.h $(srcdir)/Include/graminit.h.new

Parser/pgen/__main__.py

Lines changed: 9 additions & 5 deletions
@@ -8,22 +8,26 @@ def main():
         "grammar", type=str, help="The file with the grammar definition in EBNF format"
     )
     parser.add_argument(
-        "gramminit_h",
+        "tokens", type=str, help="The file with the token definitions"
+    )
+    parser.add_argument(
+        "graminit_h",
         type=argparse.FileType('w'),
         help="The path to write the grammar's non-terminals as #defines",
     )
     parser.add_argument(
-        "gramminit_c",
+        "graminit_c",
         type=argparse.FileType('w'),
         help="The path to write the grammar as initialized data",
     )
+
     parser.add_argument("--verbose", "-v", action="count")
     args = parser.parse_args()

-    p = ParserGenerator(args.grammar, verbose=args.verbose)
+    p = ParserGenerator(args.grammar, args.tokens, verbose=args.verbose)
     grammar = p.make_grammar()
-    grammar.produce_graminit_h(args.gramminit_h.write)
-    grammar.produce_graminit_c(args.gramminit_c.write)
+    grammar.produce_graminit_h(args.graminit_h.write)
+    grammar.produce_graminit_c(args.graminit_c.write)


 if __name__ == "__main__":
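
Taken together with the Makefile change above, pgen now takes two inputs (the grammar and the token definitions) and writes two outputs. Below is a minimal sketch of the same pipeline driven directly from Python; it assumes the working directory is the CPython source root so that Parser.pgen is importable, and it writes the .new files the same way the regen-grammar rule does.

# Minimal sketch, mirroring what __main__.py does when run as
#   python -m Parser.pgen Grammar/Grammar Grammar/Tokens \
#       Include/graminit.h Python/graminit.c
# Assumes the CPython source root is the working directory.
from Parser.pgen.pgen import ParserGenerator

p = ParserGenerator("Grammar/Grammar", "Grammar/Tokens", verbose=True)
grammar = p.make_grammar()

with open("Include/graminit.h.new", "w") as graminit_h, \
     open("Python/graminit.c.new", "w") as graminit_c:
    grammar.produce_graminit_h(graminit_h.write)
    grammar.produce_graminit_c(graminit_c.write)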

Parser/pgen/grammar.py

Lines changed: 65 additions & 2 deletions
@@ -1,6 +1,69 @@
-from lib2to3.pgen2 import grammar
+import collections

-class Grammar(grammar.Grammar):
+class Grammar:
+    """Pgen parsing tables conversion class.
+
+    Once initialized, this class supplies the grammar tables for the
+    parsing engine implemented by parse.py.  The parsing engine
+    accesses the instance variables directly.  The class here does not
+    provide initialization of the tables; several subclasses exist to
+    do this (see the conv and pgen modules).
+
+    The load() method reads the tables from a pickle file, which is
+    much faster than the other ways offered by subclasses.  The pickle
+    file is written by calling dump() (after loading the grammar
+    tables using a subclass).  The report() method prints a readable
+    representation of the tables to stdout, for debugging.
+
+    The instance variables are as follows:
+
+    symbol2number -- a dict mapping symbol names to numbers.  Symbol
+                     numbers are always 256 or higher, to distinguish
+                     them from token numbers, which are between 0 and
+                     255 (inclusive).
+
+    number2symbol -- a dict mapping numbers to symbol names;
+                     these two are each other's inverse.
+
+    states        -- a list of DFAs, where each DFA is a list of
+                     states, each state is a list of arcs, and each
+                     arc is a (i, j) pair where i is a label and j is
+                     a state number.  The DFA number is the index into
+                     this list.  (This name is slightly confusing.)
+                     Final states are represented by a special arc of
+                     the form (0, j) where j is its own state number.
+
+    dfas          -- a dict mapping symbol numbers to (DFA, first)
+                     pairs, where DFA is an item from the states list
+                     above, and first is a set of tokens that can
+                     begin this grammar rule (represented by a dict
+                     whose values are always 1).
+
+    labels        -- a list of (x, y) pairs where x is either a token
+                     number or a symbol number, and y is either None
+                     or a string; the strings are keywords.  The label
+                     number is the index in this list; label numbers
+                     are used to mark state transitions (arcs) in the
+                     DFAs.
+
+    start         -- the number of the grammar's start symbol.
+
+    keywords      -- a dict mapping keyword strings to arc labels.
+
+    tokens        -- a dict mapping token numbers to arc labels.
+
+    """
+
+    def __init__(self):
+        self.symbol2number = collections.OrderedDict()
+        self.number2symbol = collections.OrderedDict()
+        self.states = []
+        self.dfas = collections.OrderedDict()
+        self.labels = [(0, "EMPTY")]
+        self.keywords = collections.OrderedDict()
+        self.tokens = collections.OrderedDict()
+        self.symbol2label = collections.OrderedDict()
+        self.start = 256

     def produce_graminit_h(self, writer):
         writer("/* Generated by Parser/pgen */\n\n")

Parser/pgen/pgen.py

Lines changed: 25 additions & 41 deletions
@@ -1,40 +1,23 @@
-import os
-import sys
 import collections
-import importlib.machinery
+import tokenize  # from stdlib

-# Use Lib/token.py and Lib/tokenize.py to obtain the tokens. To maintain this
-# compatible with older versions of Python, we need to make sure that we only
-# import these two files (and not any of the dependencies of these files).
-
-CURRENT_FOLDER_LOCATION = os.path.dirname(os.path.realpath(__file__))
-LIB_LOCATION = os.path.realpath(os.path.join(CURRENT_FOLDER_LOCATION, '..', '..', 'Lib'))
-TOKEN_LOCATION = os.path.join(LIB_LOCATION, 'token.py')
-TOKENIZE_LOCATION = os.path.join(LIB_LOCATION, 'tokenize.py')
-
-token = importlib.machinery.SourceFileLoader('token',
-                                             TOKEN_LOCATION).load_module()
-# Add token to the module cache so tokenize.py uses that excact one instead of
-# the one in the stdlib of the interpreter executing this file.
-sys.modules['token'] = token
-tokenize = importlib.machinery.SourceFileLoader('tokenize',
-                                                TOKENIZE_LOCATION).load_module()
-
-from . import grammar
+from . import grammar, token

 class ParserGenerator(object):

-    def __init__(self, filename, stream=None, verbose=False):
+    def __init__(self, grammar_file, token_file, stream=None, verbose=False):
         close_stream = None
         if stream is None:
-            stream = open(filename)
+            stream = open(grammar_file)
             close_stream = stream.close
-        self.tokens = token
-        self.opmap = token.EXACT_TOKEN_TYPES
+        with open(token_file) as tok_file:
+            token_lines = tok_file.readlines()
+        self.tokens = dict(token.generate_tokens(token_lines))
+        self.opmap = dict(token.generate_opmap(token_lines))
         # Manually add <> so it does not collide with !=
-        self.opmap['<>'] = self.tokens.NOTEQUAL
+        self.opmap['<>'] = "NOTEQUAL"
         self.verbose = verbose
-        self.filename = filename
+        self.filename = grammar_file
         self.stream = stream
         self.generator = tokenize.generate_tokens(stream.readline)
         self.gettoken()  # Initialize lookahead
@@ -108,9 +91,9 @@ def make_label(self, c, label):
                 return ilabel
             else:
                 # A named token (NAME, NUMBER, STRING)
-                itoken = getattr(self.tokens, label, None)
+                itoken = self.tokens.get(label, None)
                 assert isinstance(itoken, int), label
-                assert itoken in self.tokens.tok_name, label
+                assert itoken in self.tokens.values(), label
                 if itoken in c.tokens:
                     return c.tokens[itoken]
                 else:
@@ -126,12 +109,13 @@ def make_label(self, c, label):
                 if value in c.keywords:
                     return c.keywords[value]
                 else:
-                    c.labels.append((self.tokens.NAME, value))
+                    c.labels.append((self.tokens["NAME"], value))
                     c.keywords[value] = ilabel
                     return ilabel
             else:
                 # An operator (any non-numeric token)
-                itoken = self.opmap[value]  # Fails if unknown token
+                tok_name = self.opmap[value]  # Fails if unknown token
+                itoken = self.tokens[tok_name]
                 if itoken in c.tokens:
                     return c.tokens[itoken]
                 else:
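
Because self.opmap now maps operator strings to token names rather than directly to numbers, the operator branch in the hunk above resolves a label in two steps. A small sketch with made-up dict contents:

# Sketch of the two-step operator lookup in make_label; the dict contents
# here are made up, the real ones are generated from Grammar/Tokens.
tokens = {"NAME": 1, "LPAR": 7, "NOTEQUAL": 28}            # name -> number
opmap = {"(": "LPAR", "!=": "NOTEQUAL", "<>": "NOTEQUAL"}  # op -> name

value = "<>"
tok_name = opmap[value]    # "NOTEQUAL"; raises KeyError for an unknown token
itoken = tokens[tok_name]  # 28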
@@ -184,16 +168,16 @@ def parse(self):
         dfas = collections.OrderedDict()
         startsymbol = None
         # MSTART: (NEWLINE | RULE)* ENDMARKER
-        while self.type != self.tokens.ENDMARKER:
-            while self.type == self.tokens.NEWLINE:
+        while self.type != tokenize.ENDMARKER:
+            while self.type == tokenize.NEWLINE:
                 self.gettoken()
             # RULE: NAME ':' RHS NEWLINE
-            name = self.expect(self.tokens.NAME)
+            name = self.expect(tokenize.NAME)
             if self.verbose:
                 print("Processing rule {dfa_name}".format(dfa_name=name))
-            self.expect(self.tokens.OP, ":")
+            self.expect(tokenize.OP, ":")
             a, z = self.parse_rhs()
-            self.expect(self.tokens.NEWLINE)
+            self.expect(tokenize.NEWLINE)
             if self.verbose:
                 self.dump_nfa(name, a, z)
             dfa = self.make_dfa(a, z)
@@ -309,7 +293,7 @@ def parse_alt(self):
         # ALT: ITEM+
         a, b = self.parse_item()
         while (self.value in ("(", "[") or
-               self.type in (self.tokens.NAME, self.tokens.STRING)):
+               self.type in (tokenize.NAME, tokenize.STRING)):
             c, d = self.parse_item()
             b.addarc(c)
             b = d
@@ -320,7 +304,7 @@ def parse_item(self):
         if self.value == "[":
             self.gettoken()
             a, z = self.parse_rhs()
-            self.expect(self.tokens.OP, "]")
+            self.expect(tokenize.OP, "]")
             a.addarc(z)
             return a, z
         else:
@@ -340,9 +324,9 @@ def parse_atom(self):
         if self.value == "(":
             self.gettoken()
             a, z = self.parse_rhs()
-            self.expect(self.tokens.OP, ")")
+            self.expect(tokenize.OP, ")")
             return a, z
-        elif self.type in (self.tokens.NAME, self.tokens.STRING):
+        elif self.type in (tokenize.NAME, tokenize.STRING):
             a = NFAState()
             z = NFAState()
             a.addarc(z, self.value)
@@ -365,7 +349,7 @@ def gettoken(self):
         while tup[0] in (tokenize.COMMENT, tokenize.NL):
             tup = next(self.generator)
         self.type, self.value, self.begin, self.end, self.line = tup
-        #print self.tokens['tok_name'][self.type], repr(self.value)
+        # print(getattr(tokenize, 'tok_name')[self.type], repr(self.value))

     def raise_error(self, msg, *args):
         if args:
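
With the importlib machinery gone, ParserGenerator tokenizes the grammar with the interpreter's own tokenize module, as gettoken() shows above: pull 5-tuples from tokenize.generate_tokens() and skip COMMENT and NL tokens. A standalone sketch of that loop over a grammar-style line:

# Minimal sketch of how gettoken() consumes the stdlib tokenizer:
# generate_tokens() yields 5-tuples; COMMENT and NL tokens are skipped.
import io
import tokenize

source = io.StringIO("single_input: NEWLINE  # a grammar rule\n")
for tup in tokenize.generate_tokens(source.readline):
    if tup[0] in (tokenize.COMMENT, tokenize.NL):
        continue
    type_, value, begin, end, line = tup
    print(tokenize.tok_name[type_], repr(value))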

Parser/pgen/token.py

Lines changed: 40 additions & 0 deletions
@@ -0,0 +1,40 @@
+import itertools
+
+def generate_tokens(tokens):
+    numbers = itertools.count(0)
+    for line in tokens:
+        line = line.strip()
+
+        if not line:
+            continue
+        if line.strip().startswith('#'):
+            continue
+
+        name = line.split()[0]
+        yield (name, next(numbers))
+
+    yield ('N_TOKENS', next(numbers))
+    yield ('NT_OFFSET', 256)
+
+def generate_opmap(tokens):
+    for line in tokens:
+        line = line.strip()
+
+        if not line:
+            continue
+        if line.strip().startswith('#'):
+            continue
+
+        pieces = line.split()
+
+        if len(pieces) != 2:
+            continue
+
+        name, op = pieces
+        yield (op.strip("'"), name)
+
+    # Yield independently <>. This is needed so it does not collide
+    # with the token generation in "generate_tokens" because if this
+    # symbol is included in Grammar/Tokens, it will collide with !=
+    # as it has the same name (NOTEQUAL).
+    yield ('<>', 'NOTEQUAL')
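
The new helpers are plain generators over the lines of Grammar/Tokens, so they are easy to exercise in isolation. A small sketch against a made-up excerpt in the Grammar/Tokens format (a token name, optionally followed by the quoted operator); it assumes Parser.pgen is importable from the CPython source root:

# Sketch of generate_tokens()/generate_opmap() on a tiny, made-up excerpt.
from Parser.pgen import token

token_lines = [
    "ENDMARKER\n",
    "NAME\n",
    "NEWLINE\n",
    "LPAR                    '('\n",
    "# comment lines and blank lines are skipped\n",
]

tokens = dict(token.generate_tokens(token_lines))
# {'ENDMARKER': 0, 'NAME': 1, 'NEWLINE': 2, 'LPAR': 3,
#  'N_TOKENS': 4, 'NT_OFFSET': 256}

opmap = dict(token.generate_opmap(token_lines))
# {'(': 'LPAR', '<>': 'NOTEQUAL'}  -- '<>' is always yielded last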
