Skip to content

Commit 8e6005c

Browse files
committed
Modify pgen2 to make it produce pgen-compatible output
1 parent 3dc67d0 commit 8e6005c

File tree

4 files changed

+242
-116
lines changed

4 files changed

+242
-116
lines changed

Lib/lib2to3/pgen2/grammar.py

Lines changed: 50 additions & 49 deletions
Original file line numberDiff line numberDiff line change
@@ -55,8 +55,7 @@ class Grammar(object):
5555
dfas -- a dict mapping symbol numbers to (DFA, first)
5656
pairs, where DFA is an item from the states list
5757
above, and first is a set of tokens that can
58-
begin this grammar rule (represented by a dict
59-
whose values are always 1).
58+
begin this grammar rule.
6059
6160
labels -- a list of (x, y) pairs where x is either a token
6261
number or a symbol number, and y is either None
@@ -131,53 +130,55 @@ def report(self):
131130
# Map from operator to number (since tokenize doesn't do this)
132131

133132
opmap_raw = """
134-
( LPAR
135-
) RPAR
136-
[ LSQB
137-
] RSQB
138-
: COLON
139-
, COMMA
140-
; SEMI
141-
+ PLUS
142-
- MINUS
143-
* STAR
144-
/ SLASH
145-
| VBAR
146-
& AMPER
147-
< LESS
148-
> GREATER
149-
= EQUAL
150-
. DOT
151-
% PERCENT
152-
` BACKQUOTE
153-
{ LBRACE
154-
} RBRACE
155-
@ AT
156-
@= ATEQUAL
157-
== EQEQUAL
158-
!= NOTEQUAL
159-
<> NOTEQUAL
160-
<= LESSEQUAL
161-
>= GREATEREQUAL
162-
~ TILDE
163-
^ CIRCUMFLEX
164-
<< LEFTSHIFT
165-
>> RIGHTSHIFT
166-
** DOUBLESTAR
167-
+= PLUSEQUAL
168-
-= MINEQUAL
169-
*= STAREQUAL
170-
/= SLASHEQUAL
171-
%= PERCENTEQUAL
172-
&= AMPEREQUAL
173-
|= VBAREQUAL
174-
^= CIRCUMFLEXEQUAL
175-
<<= LEFTSHIFTEQUAL
176-
>>= RIGHTSHIFTEQUAL
177-
**= DOUBLESTAREQUAL
178-
// DOUBLESLASH
179-
//= DOUBLESLASHEQUAL
180-
-> RARROW
133+
( LPAR
134+
) RPAR
135+
[ LSQB
136+
] RSQB
137+
: COLON
138+
, COMMA
139+
; SEMI
140+
+ PLUS
141+
- MINUS
142+
* STAR
143+
/ SLASH
144+
| VBAR
145+
& AMPER
146+
< LESS
147+
> GREATER
148+
= EQUAL
149+
. DOT
150+
% PERCENT
151+
{ LBRACE
152+
} RBRACE
153+
== EQEQUAL
154+
!= NOTEQUAL
155+
<> NOTEQUAL
156+
<= LESSEQUAL
157+
>= GREATEREQUAL
158+
~ TILDE
159+
^ CIRCUMFLEX
160+
<< LEFTSHIFT
161+
>> RIGHTSHIFT
162+
** DOUBLESTAR
163+
+= PLUSEQUAL
164+
-= MINEQUAL
165+
*= STAREQUAL
166+
/= SLASHEQUAL
167+
%= PERCENTEQUAL
168+
&= AMPEREQUAL
169+
|= VBAREQUAL
170+
^= CIRCUMFLEXEQUAL
171+
<<= LEFTSHIFTEQUAL
172+
>>= RIGHTSHIFTEQUAL
173+
**= DOUBLESTAREQUAL
174+
// DOUBLESLASH
175+
//= DOUBLESLASHEQUAL
176+
@ AT
177+
@= ATEQUAL
178+
-> RARROW
179+
... ELLIPSIS
180+
:= COLONEQUAL
181+
` BACKQUOTE
181182
"""
182183

183184
opmap = {}

Lib/lib2to3/pgen2/pgen.py

Lines changed: 137 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -3,17 +3,113 @@
33

44
# Pgen imports
55
from . import grammar, token, tokenize
6+
import collections
7+
68

79
class PgenGrammar(grammar.Grammar):
8-
pass
10+
def produce_graminit_h(self, writer):
11+
writer("/* Generated by Parser/pgen2 */\n\n")
12+
for number, symbol in self.number2symbol.items():
13+
writer("#define {} {}\n".format(symbol, number))
14+
15+
def produce_graminit_c(self, writer):
16+
writer("/* Generated by Parser/pgen2 */\n\n")
17+
18+
writer('#include "pgenheaders.h"\n')
19+
writer('#include "grammar.h"\n')
20+
writer("grammar _PyParser_Grammar;\n")
21+
22+
self.print_dfas(writer)
23+
self.print_labels(writer)
24+
25+
writer("grammar _PyParser_Grammar = {\n")
26+
writer(" {n_dfas},\n".format(n_dfas=len(self.dfas)))
27+
writer(" dfas,\n")
28+
writer(" {{{n_labels}, labels}},\n".format(n_labels=len(self.labels)))
29+
writer(" {start_number}\n".format(start_number=self.start))
30+
writer("};\n")
31+
32+
def print_labels(self, writer):
33+
writer(
34+
"static label labels[{n_labels}] = {{\n".format(n_labels=len(self.labels))
35+
)
36+
for label, name in self.labels:
37+
if name is None:
38+
writer(" {{{label}, 0}},\n".format(label=label))
39+
else:
40+
writer(
41+
' {{{label}, "{label_name}"}},\n'.format(
42+
label=label, label_name=name
43+
)
44+
)
45+
writer("};\n")
46+
47+
def print_dfas(self, writer):
48+
self.print_states(writer)
49+
writer("static dfa dfas[{}] = {{\n".format(len(self.dfas)))
50+
for dfaindex, dfa_elem in enumerate(self.dfas.items()):
51+
symbol, (dfa, first_sets) = dfa_elem
52+
writer(
53+
' {{{dfa_symbol}, "{symbol_name}", '.format(
54+
dfa_symbol=symbol, symbol_name=self.number2symbol[symbol]
55+
)
56+
+ "0, {n_states}, states_{dfa_index},\n".format(
57+
n_states=len(dfa), dfa_index=dfaindex
58+
)
59+
)
60+
writer(' "')
61+
62+
k = [name for label, name in self.labels if label in first_sets]
63+
bitset = bytearray((len(self.labels) >> 3) + 1)
64+
for token in first_sets:
65+
bitset[token >> 3] |= 1 << (token & 7)
66+
for byte in bitset:
67+
writer("\\%03o" % (byte & 0xFF))
68+
writer('"},\n')
69+
writer("};\n")
70+
71+
def print_states(self, write):
72+
for dfaindex, dfa in enumerate(self.states):
73+
self.print_arcs(write, dfaindex, dfa)
74+
write(
75+
"static state states_{dfa_index}[{n_states}] = {{\n".format(
76+
dfa_index=dfaindex, n_states=len(dfa)
77+
)
78+
)
79+
for stateindex, state in enumerate(dfa):
80+
narcs = len(state)
81+
write(
82+
" {{{n_arcs}, arcs_{dfa_index}_{state_index}}},\n".format(
83+
n_arcs=narcs, dfa_index=dfaindex, state_index=stateindex
84+
)
85+
)
86+
write("};\n")
87+
88+
def print_arcs(self, write, dfaindex, states):
89+
for stateindex, state in enumerate(states):
90+
narcs = len(state)
91+
write(
92+
"static arc arcs_{dfa_index}_{state_index}[{n_arcs}] = {{\n".format(
93+
dfa_index=dfaindex, state_index=stateindex, n_arcs=narcs
94+
)
95+
)
96+
for a, b in state:
97+
write(
98+
" {{{from_label}, {to_state}}},\n".format(
99+
from_label=a, to_state=b
100+
)
101+
)
102+
write("};\n")
103+
9104

10105
class ParserGenerator(object):
11106

12-
def __init__(self, filename, stream=None):
107+
def __init__(self, filename, stream=None, verbose=False):
13108
close_stream = None
14109
if stream is None:
15110
stream = open(filename)
16111
close_stream = stream.close
112+
self.verbose = verbose
17113
self.filename = filename
18114
self.stream = stream
19115
self.generator = tokenize.generate_tokens(stream.readline)
@@ -27,14 +123,14 @@ def __init__(self, filename, stream=None):
27123
def make_grammar(self):
28124
c = PgenGrammar()
29125
names = list(self.dfas.keys())
30-
names.sort()
31126
names.remove(self.startsymbol)
32127
names.insert(0, self.startsymbol)
33128
for name in names:
34129
i = 256 + len(c.symbol2number)
35130
c.symbol2number[name] = i
36131
c.number2symbol[i] = name
37132
for name in names:
133+
self.make_label(c, name)
38134
dfa = self.dfas[name]
39135
states = []
40136
for state in dfa:
@@ -47,15 +143,30 @@ def make_grammar(self):
47143
c.states.append(states)
48144
c.dfas[c.symbol2number[name]] = (states, self.make_first(c, name))
49145
c.start = c.symbol2number[self.startsymbol]
146+
147+
if self.verbose:
148+
print("")
149+
print("Grammar summary")
150+
print("===============")
151+
152+
print("- {n_labels} labels".format(n_labels=len(c.labels)))
153+
print("- {n_dfas} dfas".format(n_dfas=len(c.dfas)))
154+
print("- {n_tokens} tokens".format(n_tokens=len(c.tokens)))
155+
print("- {n_keywords} keywords".format(n_keywords=len(c.keywords)))
156+
print(
157+
"- Start symbol: {start_symbol}".format(
158+
start_symbol=c.number2symbol[c.start]
159+
)
160+
)
50161
return c
51162

52163
def make_first(self, c, name):
53164
rawfirst = self.first[name]
54-
first = {}
165+
first = set()
55166
for label in sorted(rawfirst):
56167
ilabel = self.make_label(c, label)
57168
##assert ilabel not in first # XXX failed on <> ... !=
58-
first[ilabel] = 1
169+
first.add(ilabel)
59170
return first
60171

61172
def make_label(self, c, label):
@@ -106,17 +217,20 @@ def make_label(self, c, label):
106217

107218
def addfirstsets(self):
108219
names = list(self.dfas.keys())
109-
names.sort()
110220
for name in names:
111221
if name not in self.first:
112222
self.calcfirst(name)
113-
#print name, self.first[name].keys()
223+
224+
if self.verbose:
225+
print("First set for {dfa_name}".format(dfa_name=name))
226+
for item in self.first[name]:
227+
print(" - {terminal}".format(terminal=item))
114228

115229
def calcfirst(self, name):
116230
dfa = self.dfas[name]
117231
self.first[name] = None # dummy to detect left recursion
118232
state = dfa[0]
119-
totalset = {}
233+
totalset = set()
120234
overlapcheck = {}
121235
for label, next in state.arcs.items():
122236
if label in self.dfas:
@@ -130,8 +244,8 @@ def calcfirst(self, name):
130244
totalset.update(fset)
131245
overlapcheck[label] = fset
132246
else:
133-
totalset[label] = 1
134-
overlapcheck[label] = {label: 1}
247+
totalset.add(label)
248+
overlapcheck[label] = {label}
135249
inverse = {}
136250
for label, itsfirst in overlapcheck.items():
137251
for symbol in itsfirst:
@@ -143,20 +257,24 @@ def calcfirst(self, name):
143257
self.first[name] = totalset
144258

145259
def parse(self):
146-
dfas = {}
260+
dfas = collections.OrderedDict()
147261
startsymbol = None
148262
# MSTART: (NEWLINE | RULE)* ENDMARKER
149263
while self.type != token.ENDMARKER:
150264
while self.type == token.NEWLINE:
151265
self.gettoken()
152266
# RULE: NAME ':' RHS NEWLINE
153267
name = self.expect(token.NAME)
268+
if self.verbose:
269+
print("Processing rule {dfa_name}".format(dfa_name=name))
154270
self.expect(token.OP, ":")
155271
a, z = self.parse_rhs()
156272
self.expect(token.NEWLINE)
157-
#self.dump_nfa(name, a, z)
273+
if self.verbose:
274+
self.dump_nfa(name, a, z)
158275
dfa = self.make_dfa(a, z)
159-
#self.dump_dfa(name, dfa)
276+
if self.verbose:
277+
self.dump_dfa(name, dfa)
160278
oldlen = len(dfa)
161279
self.simplify_dfa(dfa)
162280
newlen = len(dfa)
@@ -174,14 +292,14 @@ def make_dfa(self, start, finish):
174292
assert isinstance(start, NFAState)
175293
assert isinstance(finish, NFAState)
176294
def closure(state):
177-
base = {}
295+
base = set()
178296
addclosure(state, base)
179297
return base
180298
def addclosure(state, base):
181299
assert isinstance(state, NFAState)
182300
if state in base:
183301
return
184-
base[state] = 1
302+
base.add(state)
185303
for label, next in state.arcs:
186304
if label is None:
187305
addclosure(next, base)
@@ -191,7 +309,7 @@ def addclosure(state, base):
191309
for nfastate in state.nfaset:
192310
for label, next in nfastate.arcs:
193311
if label is not None:
194-
addclosure(next, arcs.setdefault(label, {}))
312+
addclosure(next, arcs.setdefault(label, set()))
195313
for label, nfaset in sorted(arcs.items()):
196314
for st in states:
197315
if st.nfaset == nfaset:
@@ -347,7 +465,7 @@ def addarc(self, next, label=None):
347465
class DFAState(object):
348466

349467
def __init__(self, nfaset, final):
350-
assert isinstance(nfaset, dict)
468+
assert isinstance(nfaset, set)
351469
assert isinstance(next(iter(nfaset)), NFAState)
352470
assert isinstance(final, NFAState)
353471
self.nfaset = nfaset
@@ -381,6 +499,6 @@ def __eq__(self, other):
381499

382500
__hash__ = None # For Py3 compatibility.
383501

384-
def generate_grammar(filename="Grammar.txt"):
385-
p = ParserGenerator(filename)
502+
def generate_grammar(filename="Grammar.txt", verbose=False):
503+
p = ParserGenerator(filename, verbose=verbose)
386504
return p.make_grammar()

0 commit comments

Comments (0)