Skip to content

Commit 104b8e7

Browse files
committed
Use a monkey-patched version of pgen2 in Parser/pgen
1 parent fdf2ac4 commit 104b8e7

File tree

3 files changed

+330
-8
lines changed

3 files changed

+330
-8
lines changed

Parser/pgen/__main__.py

Lines changed: 77 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,18 +1,87 @@
11
import os
22
import sys
33
import argparse
4+
import collections
5+
6+
from lib2to3.pgen2 import pgen, grammar, tokenize
7+
8+
from . import token
9+
from . import grammar as pgen_grammar
10+
11+
# Monkey-patch lib2to3's pgen2 so it generates tables with CPython's token
# numbering instead of lib2to3's own: copy every attribute of this package's
# token module onto pgen2's tokenize module (pgen reads token constants from
# there), overwriting the lib2to3 values.
for name in dir(token):
    setattr(tokenize, name, getattr(token, name))

# Also point pgen at our token module and at our Grammar subclass, which
# knows how to emit graminit.h / graminit.c (see Parser/pgen/grammar.py).
# NOTE: this mutates the shared lib2to3.pgen2.pgen module in-process; any
# later use of pgen2 in the same interpreter sees the patched version.
pgen.token = token
pgen.grammar = pgen_grammar
16+
17+
class ParserGenerator(pgen.ParserGenerator):
    """Parser generator that builds CPython's C grammar tables.

    Subclasses lib2to3's pgen2 ParserGenerator (monkey-patched at module
    level to use this package's token numbering and Grammar class) and
    overrides just enough to (a) keep the rules in source order via an
    OrderedDict and (b) build a Grammar that can emit graminit.h/graminit.c.
    """

    def parse(self):
        """Parse the meta-grammar input into ``(dfas, startsymbol)``.

        Same algorithm as pgen2's parse(), except the DFAs are collected
        in an OrderedDict so the emitted C tables follow the order in
        which the rules appear in the Grammar file.
        """
        dfas = collections.OrderedDict()
        startsymbol = None
        # MSTART: (NEWLINE | RULE)* ENDMARKER
        while self.type != token.ENDMARKER:
            while self.type == token.NEWLINE:
                self.gettoken()
            # RULE: NAME ':' RHS NEWLINE
            name = self.expect(token.NAME)
            self.expect(token.OP, ":")
            a, z = self.parse_rhs()
            self.expect(token.NEWLINE)
            dfa = self.make_dfa(a, z)
            self.simplify_dfa(dfa)
            dfas[name] = dfa
            # The first rule in the file becomes the grammar's start symbol.
            if startsymbol is None:
                startsymbol = name
        return dfas, startsymbol

    def make_grammar(self, verbose=False):
        """Build and return the Grammar holding the generated tables.

        verbose: if true, print a short summary of the generated grammar
        to stdout.
        """
        c = pgen_grammar.Grammar()
        names = list(self.dfas.keys())
        # The start symbol must receive the first (lowest) symbol number.
        names.remove(self.startsymbol)
        names.insert(0, self.startsymbol)
        # Non-terminals are numbered from 256 upward (terminals stay < 256).
        for name in names:
            i = 256 + len(c.symbol2number)
            c.symbol2number[name] = i
            c.number2symbol[i] = name
        for name in names:
            self.make_label(c, name)
            dfa = self.dfas[name]
            states = []
            for state in dfa:
                arcs = []
                for label, next_state in sorted(state.arcs.items()):
                    arcs.append((self.make_label(c, label), dfa.index(next_state)))
                if state.isfinal:
                    # An arc labelled 0 (EMPTY) to itself marks an accepting state.
                    arcs.append((0, dfa.index(state)))
                states.append(arcs)
            c.states.append(states)
            c.dfas[c.symbol2number[name]] = (states, self.make_first(c, name))
        c.start = c.symbol2number[self.startsymbol]

        if verbose:
            print("")
            print("Grammar summary")
            print("===============")

            print("- {n_labels} labels".format(n_labels=len(c.labels)))
            print("- {n_dfas} dfas".format(n_dfas=len(c.dfas)))
            print("- {n_tokens} tokens".format(n_tokens=len(c.tokens)))
            print("- {n_keywords} keywords".format(n_keywords=len(c.keywords)))
            print(
                "- Start symbol: {start_symbol}".format(
                    start_symbol=c.number2symbol[c.start]
                )
            )
        return c
479

5-
# Make sure we pick the current version of pgen2 (and not whatever is installed)
6-
CURRENT_FOLDER_LOCATION = os.path.dirname(os.path.realpath(__file__))
7-
LIB2TO3LOCATION = os.path.realpath(os.path.join(CURRENT_FOLDER_LOCATION,
8-
'..', '..', 'Lib', 'lib2to3'))
980

10-
sys.path.insert(0, LIB2TO3LOCATION)
11-
from pgen2 import pgen
12-
sys.path.pop(0)
1381

1482
def main(grammar_file, gramminit_h_file, gramminit_c_file, verbose):
    """Generate the graminit.h and graminit.c tables from a Grammar file.

    grammar_file: path of the Grammar input; the *_file arguments are
    writable file objects receiving the generated C header and source.
    """
    generator = ParserGenerator(grammar_file)
    c_grammar = generator.make_grammar(verbose=verbose)
    c_grammar.produce_graminit_h(gramminit_h_file.write)
    c_grammar.produce_graminit_c(gramminit_c_file.write)
1887

Parser/pgen/grammar.py

Lines changed: 161 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,161 @@
1+
from lib2to3.pgen2 import grammar
2+
3+
from . import token
4+
5+
6+
class Grammar(grammar.Grammar):
    """pgen2 Grammar subclass that can write CPython's graminit.h/.c files.

    Every ``writer`` parameter below is a callable taking one string
    (typically a file object's ``write`` method).
    """

    def produce_graminit_h(self, writer):
        """Write a ``#define`` line for each non-terminal symbol."""
        writer("/* Generated by Parser/pgen2 */\n\n")
        for number, symbol in self.number2symbol.items():
            writer("#define {} {}\n".format(symbol, number))

    def produce_graminit_c(self, writer):
        """Write the complete C source of the grammar tables."""
        writer("/* Generated by Parser/pgen2 */\n\n")

        writer('#include "pgenheaders.h"\n')
        writer('#include "grammar.h"\n')
        writer("grammar _PyParser_Grammar;\n")

        # Emit the state/arc/dfa arrays first, then the label table, then
        # the grammar struct that references them.
        self.print_dfas(writer)
        self.print_labels(writer)

        writer("grammar _PyParser_Grammar = {\n")
        writer(" {n_dfas},\n".format(n_dfas=len(self.dfas)))
        writer(" dfas,\n")
        writer(" {{{n_labels}, labels}},\n".format(n_labels=len(self.labels)))
        writer(" {start_number}\n".format(start_number=self.start))
        writer("};\n")

    def print_labels(self, writer):
        """Write the ``labels`` array: (token number, name-or-0) pairs."""
        writer(
            "static label labels[{n_labels}] = {{\n".format(n_labels=len(self.labels))
        )
        for label, name in self.labels:
            if name is None:
                writer(" {{{label}, 0}},\n".format(label=label))
            else:
                writer(
                    ' {{{label}, "{label_name}"}},\n'.format(
                        label=label, label_name=name
                    )
                )
        writer("};\n")

    def print_dfas(self, writer):
        """Write the per-rule state arrays followed by the ``dfas`` table."""
        self.print_states(writer)
        writer("static dfa dfas[{}] = {{\n".format(len(self.dfas)))
        for dfaindex, dfa_elem in enumerate(self.dfas.items()):
            symbol, (dfa, first_sets) = dfa_elem
            writer(
                ' {{{dfa_symbol}, "{symbol_name}", '.format(
                    dfa_symbol=symbol, symbol_name=self.number2symbol[symbol]
                )
                + "0, {n_states}, states_{dfa_index},\n".format(
                    n_states=len(dfa), dfa_index=dfaindex
                )
            )
            writer(' "')

            # Encode the first set as a bitmask (one bit per label index),
            # emitted as octal escapes inside a C string literal.
            # NOTE(review): the original also built an unused list of label
            # names here (dead code) and named this loop variable ``token``,
            # shadowing the module-level token import — both fixed.
            bitset = bytearray((len(self.labels) >> 3) + 1)
            for label_index in first_sets:
                bitset[label_index >> 3] |= 1 << (label_index & 7)
            for byte in bitset:
                writer("\\%03o" % (byte & 0xFF))
            writer('"},\n')
        writer("};\n")

    def print_states(self, write):
        """Write one ``states_<i>`` array per DFA, referencing its arc arrays."""
        for dfaindex, dfa in enumerate(self.states):
            self.print_arcs(write, dfaindex, dfa)
            write(
                "static state states_{dfa_index}[{n_states}] = {{\n".format(
                    dfa_index=dfaindex, n_states=len(dfa)
                )
            )
            for stateindex, state in enumerate(dfa):
                narcs = len(state)
                write(
                    " {{{n_arcs}, arcs_{dfa_index}_{state_index}}},\n".format(
                        n_arcs=narcs, dfa_index=dfaindex, state_index=stateindex
                    )
                )
            write("};\n")

    def print_arcs(self, write, dfaindex, states):
        """Write one ``arcs_<dfa>_<state>`` array per state of one DFA."""
        for stateindex, state in enumerate(states):
            narcs = len(state)
            write(
                "static arc arcs_{dfa_index}_{state_index}[{n_arcs}] = {{\n".format(
                    dfa_index=dfaindex, state_index=stateindex, n_arcs=narcs
                )
            )
            for a, b in state:
                write(
                    " {{{from_label}, {to_state}}},\n".format(
                        from_label=a, to_state=b
                    )
                )
            write("};\n")
101+
102+
103+
104+
105+
# Table of operator spellings and their token names, one pair per line.
opmap_raw = """
( LPAR
) RPAR
[ LSQB
] RSQB
: COLON
, COMMA
; SEMI
+ PLUS
- MINUS
* STAR
/ SLASH
| VBAR
& AMPER
< LESS
> GREATER
= EQUAL
. DOT
% PERCENT
{ LBRACE
} RBRACE
== EQEQUAL
!= NOTEQUAL
<> NOTEQUAL
<= LESSEQUAL
>= GREATEREQUAL
~ TILDE
^ CIRCUMFLEX
<< LEFTSHIFT
>> RIGHTSHIFT
** DOUBLESTAR
+= PLUSEQUAL
-= MINEQUAL
*= STAREQUAL
/= SLASHEQUAL
%= PERCENTEQUAL
&= AMPEREQUAL
|= VBAREQUAL
^= CIRCUMFLEXEQUAL
<<= LEFTSHIFTEQUAL
>>= RIGHTSHIFTEQUAL
**= DOUBLESTAREQUAL
// DOUBLESLASH
//= DOUBLESLASHEQUAL
@ AT
@= ATEQUAL
-> RARROW
... ELLIPSIS
:= COLONEQUAL
` BACKQUOTE
"""

# Map each operator spelling to its token number (skipping blank lines).
opmap = {
    op: getattr(token, token_name)
    for op, token_name in (
        entry.split() for entry in opmap_raw.splitlines() if entry
    )
}

Parser/pgen/token.py

Lines changed: 92 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,92 @@
1+
#! /usr/bin/env python3

"""Token constants (from "token.h")."""

# Taken from Python (r53757) and modified to include some tokens
# originally monkeypatched in by pgen2.tokenize

# --start constants--
# Structural tokens emitted by the tokenizer.
ENDMARKER = 0
NAME = 1
NUMBER = 2
STRING = 3
NEWLINE = 4
INDENT = 5
DEDENT = 6
# Operator and delimiter tokens (their spellings are mapped to these
# numbers by opmap in grammar.py).
LPAR = 7
RPAR = 8
LSQB = 9
RSQB = 10
COLON = 11
COMMA = 12
SEMI = 13
PLUS = 14
MINUS = 15
STAR = 16
SLASH = 17
VBAR = 18
AMPER = 19
LESS = 20
GREATER = 21
EQUAL = 22
DOT = 23
PERCENT = 24
LBRACE = 25
RBRACE = 26
EQEQUAL = 27
NOTEQUAL = 28
LESSEQUAL = 29
GREATEREQUAL = 30
TILDE = 31
CIRCUMFLEX = 32
LEFTSHIFT = 33
RIGHTSHIFT = 34
DOUBLESTAR = 35
PLUSEQUAL = 36
MINEQUAL = 37
STAREQUAL = 38
SLASHEQUAL = 39
PERCENTEQUAL = 40
AMPEREQUAL = 41
VBAREQUAL = 42
CIRCUMFLEXEQUAL = 43
LEFTSHIFTEQUAL = 44
RIGHTSHIFTEQUAL = 45
DOUBLESTAREQUAL = 46
DOUBLESLASH = 47
DOUBLESLASHEQUAL = 48
AT = 49
ATEQUAL = 50
RARROW = 51
ELLIPSIS = 52
COLONEQUAL = 53
# Catch-all operator token plus the extra tokens pgen2.tokenize normally
# monkey-patches in.
OP = 54
TYPE_IGNORE = 55
TYPE_COMMENT = 56
ERRORTOKEN = 57
COMMENT = 58
NL = 59
ENCODING = 60
ASYNC = 61
AWAIT = 62
BACKQUOTE = 63
N_TOKENS = 64
# Non-terminal symbol numbers start here; everything below is a terminal.
NT_OFFSET = 256
# --end constants--
76+
77+
# Reverse mapping: token number -> token name, derived from the integer
# constants defined in this module.
tok_name = {
    _value: _name
    for _name, _value in list(globals().items())
    if type(_value) is type(0)
}
81+
82+
83+
def ISTERMINAL(x):
    """Return True if *x* is a terminal token number (below NT_OFFSET)."""
    return NT_OFFSET > x
85+
86+
87+
def ISNONTERMINAL(x):
    """Return True if *x* is a non-terminal symbol number (NT_OFFSET or above)."""
    return NT_OFFSET <= x
89+
90+
91+
def ISEOF(x):
    """Return True if *x* is the end-of-input marker token."""
    return ENDMARKER == x

0 commit comments

Comments
 (0)