
Commit b1fae75

Use tokenize from stdlib, detach completely from lib2to3 and fix some typos
Parent: 3d593ef

File tree (5 files changed, +140 −48 lines):

Makefile.pre.in
Parser/pgen/__main__.py
Parser/pgen/grammar.py
Parser/pgen/pgen.py
Parser/pgen/token.py

Makefile.pre.in

Lines changed: 1 addition & 0 deletions
@@ -788,6 +788,7 @@ regen-grammar: regen-token
 	# from Grammar/Grammar using pgen
 	@$(MKDIR_P) Include
 	$(PYTHON_FOR_REGEN) -m Parser.pgen $(srcdir)/Grammar/Grammar \
+		$(srcdir)/Grammar/Tokens \
 		$(srcdir)/Include/graminit.h.new \
 		$(srcdir)/Python/graminit.c.new
 	$(UPDATE_FILE) $(srcdir)/Include/graminit.h $(srcdir)/Include/graminit.h.new

Parser/pgen/__main__.py

Lines changed: 9 additions & 5 deletions
@@ -8,22 +8,26 @@ def main():
         "grammar", type=str, help="The file with the grammar definition in EBNF format"
     )
     parser.add_argument(
-        "gramminit_h",
+        "tokens", type=str, help="The file with the token definitions"
+    )
+    parser.add_argument(
+        "graminit_h",
         type=argparse.FileType('w'),
         help="The path to write the grammar's non-terminals as #defines",
     )
     parser.add_argument(
-        "gramminit_c",
+        "graminit_c",
         type=argparse.FileType('w'),
         help="The path to write the grammar as initialized data",
     )
+
     parser.add_argument("--verbose", "-v", action="count")
     args = parser.parse_args()

-    p = ParserGenerator(args.grammar, verbose=args.verbose)
+    p = ParserGenerator(args.grammar, args.tokens, verbose=args.verbose)
     grammar = p.make_grammar()
-    grammar.produce_graminit_h(args.gramminit_h.write)
-    grammar.produce_graminit_c(args.gramminit_c.write)
+    grammar.produce_graminit_h(args.graminit_h.write)
+    grammar.produce_graminit_c(args.graminit_c.write)


 if __name__ == "__main__":
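
Taken together with the Makefile change above, pgen now takes two inputs (the grammar and the token definitions) and writes two outputs. Below is a minimal sketch of the same pipeline driven directly from Python; it assumes the working directory is the CPython source root so that Parser.pgen is importable, and it writes the .new files the same way the regen-grammar rule does.

# Minimal sketch, mirroring what __main__.py does when run as
#   python -m Parser.pgen Grammar/Grammar Grammar/Tokens \
#       Include/graminit.h Python/graminit.c
# Assumes the CPython source root is the working directory.
from Parser.pgen.pgen import ParserGenerator

p = ParserGenerator("Grammar/Grammar", "Grammar/Tokens", verbose=True)
grammar = p.make_grammar()

with open("Include/graminit.h.new", "w") as graminit_h, \
     open("Python/graminit.c.new", "w") as graminit_c:
    grammar.produce_graminit_h(graminit_h.write)
    grammar.produce_graminit_c(graminit_c.write)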

Parser/pgen/grammar.py

Lines changed: 65 additions & 2 deletions
@@ -1,6 +1,69 @@
-from lib2to3.pgen2 import grammar
+import collections

-class Grammar(grammar.Grammar):
+class Grammar:
+    """Pgen parsing tables conversion class.
+
+    Once initialized, this class supplies the grammar tables for the
+    parsing engine implemented by parse.py.  The parsing engine
+    accesses the instance variables directly.  The class here does not
+    provide initialization of the tables; several subclasses exist to
+    do this (see the conv and pgen modules).
+
+    The load() method reads the tables from a pickle file, which is
+    much faster than the other ways offered by subclasses.  The pickle
+    file is written by calling dump() (after loading the grammar
+    tables using a subclass).  The report() method prints a readable
+    representation of the tables to stdout, for debugging.
+
+    The instance variables are as follows:
+
+    symbol2number -- a dict mapping symbol names to numbers.  Symbol
+                     numbers are always 256 or higher, to distinguish
+                     them from token numbers, which are between 0 and
+                     255 (inclusive).
+
+    number2symbol -- a dict mapping numbers to symbol names;
+                     these two are each other's inverse.
+
+    states        -- a list of DFAs, where each DFA is a list of
+                     states, each state is a list of arcs, and each
+                     arc is a (i, j) pair where i is a label and j is
+                     a state number.  The DFA number is the index into
+                     this list.  (This name is slightly confusing.)
+                     Final states are represented by a special arc of
+                     the form (0, j) where j is its own state number.
+
+    dfas          -- a dict mapping symbol numbers to (DFA, first)
+                     pairs, where DFA is an item from the states list
+                     above, and first is a set of tokens that can
+                     begin this grammar rule (represented by a dict
+                     whose values are always 1).
+
+    labels        -- a list of (x, y) pairs where x is either a token
+                     number or a symbol number, and y is either None
+                     or a string; the strings are keywords.  The label
+                     number is the index in this list; label numbers
+                     are used to mark state transitions (arcs) in the
+                     DFAs.
+
+    start         -- the number of the grammar's start symbol.
+
+    keywords      -- a dict mapping keyword strings to arc labels.
+
+    tokens        -- a dict mapping token numbers to arc labels.
+
+    """
+
+    def __init__(self):
+        self.symbol2number = collections.OrderedDict()
+        self.number2symbol = collections.OrderedDict()
+        self.states = []
+        self.dfas = collections.OrderedDict()
+        self.labels = [(0, "EMPTY")]
+        self.keywords = collections.OrderedDict()
+        self.tokens = collections.OrderedDict()
+        self.symbol2label = collections.OrderedDict()
+        self.start = 256

     def produce_graminit_h(self, writer):
         writer("/* Generated by Parser/pgen */\n\n")

Parser/pgen/pgen.py

Lines changed: 25 additions & 41 deletions
@@ -1,40 +1,23 @@
-import os
-import sys
 import collections
-import importlib.machinery
+import tokenize  # from stdlib

-# Use Lib/token.py and Lib/tokenize.py to obtain the tokens. To maintain this
-# compatible with older versions of Python, we need to make sure that we only
-# import these two files (and not any of the dependencies of these files).
-
-CURRENT_FOLDER_LOCATION = os.path.dirname(os.path.realpath(__file__))
-LIB_LOCATION = os.path.realpath(os.path.join(CURRENT_FOLDER_LOCATION, '..', '..', 'Lib'))
-TOKEN_LOCATION = os.path.join(LIB_LOCATION, 'token.py')
-TOKENIZE_LOCATION = os.path.join(LIB_LOCATION, 'tokenize.py')
-
-token = importlib.machinery.SourceFileLoader('token',
-                                             TOKEN_LOCATION).load_module()
-# Add token to the module cache so tokenize.py uses that excact one instead of
-# the one in the stdlib of the interpreter executing this file.
-sys.modules['token'] = token
-tokenize = importlib.machinery.SourceFileLoader('tokenize',
-                                                TOKENIZE_LOCATION).load_module()
-
-from . import grammar
+from . import grammar, token

 class ParserGenerator(object):

-    def __init__(self, filename, stream=None, verbose=False):
+    def __init__(self, grammar_file, token_file, stream=None, verbose=False):
         close_stream = None
         if stream is None:
-            stream = open(filename)
+            stream = open(grammar_file)
             close_stream = stream.close
-        self.tokens = token
-        self.opmap = token.EXACT_TOKEN_TYPES
+        with open(token_file) as tok_file:
+            token_lines = tok_file.readlines()
+        self.tokens = dict(token.generate_tokens(token_lines))
+        self.opmap = dict(token.generate_opmap(token_lines))
         # Manually add <> so it does not collide with !=
-        self.opmap['<>'] = self.tokens.NOTEQUAL
+        self.opmap['<>'] = "NOTEQUAL"
         self.verbose = verbose
-        self.filename = filename
+        self.filename = grammar_file
         self.stream = stream
         self.generator = tokenize.generate_tokens(stream.readline)
         self.gettoken()  # Initialize lookahead
@@ -108,9 +91,9 @@ def make_label(self, c, label):
                 return ilabel
             else:
                 # A named token (NAME, NUMBER, STRING)
-                itoken = getattr(self.tokens, label, None)
+                itoken = self.tokens.get(label, None)
                 assert isinstance(itoken, int), label
-                assert itoken in self.tokens.tok_name, label
+                assert itoken in self.tokens.values(), label
                 if itoken in c.tokens:
                     return c.tokens[itoken]
                 else:
@@ -126,12 +109,13 @@ def make_label(self, c, label):
                 if value in c.keywords:
                     return c.keywords[value]
                 else:
-                    c.labels.append((self.tokens.NAME, value))
+                    c.labels.append((self.tokens["NAME"], value))
                     c.keywords[value] = ilabel
                     return ilabel
             else:
                 # An operator (any non-numeric token)
-                itoken = self.opmap[value]  # Fails if unknown token
+                tok_name = self.opmap[value]  # Fails if unknown token
+                itoken = self.tokens[tok_name]
                 if itoken in c.tokens:
                     return c.tokens[itoken]
                 else:
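
Because self.opmap now maps operator strings to token names rather than directly to numbers, the operator branch in the hunk above resolves a label in two steps. A small sketch with made-up dict contents:

# Sketch of the two-step operator lookup in make_label; the dict contents
# here are made up, the real ones are generated from Grammar/Tokens.
tokens = {"NAME": 1, "LPAR": 7, "NOTEQUAL": 28}            # name -> number
opmap = {"(": "LPAR", "!=": "NOTEQUAL", "<>": "NOTEQUAL"}  # op -> name

value = "<>"
tok_name = opmap[value]    # "NOTEQUAL"; raises KeyError for an unknown token
itoken = tokens[tok_name]  # 28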
@@ -184,16 +168,16 @@ def parse(self):
         dfas = collections.OrderedDict()
         startsymbol = None
         # MSTART: (NEWLINE | RULE)* ENDMARKER
-        while self.type != self.tokens.ENDMARKER:
-            while self.type == self.tokens.NEWLINE:
+        while self.type != tokenize.ENDMARKER:
+            while self.type == tokenize.NEWLINE:
                 self.gettoken()
             # RULE: NAME ':' RHS NEWLINE
-            name = self.expect(self.tokens.NAME)
+            name = self.expect(tokenize.NAME)
             if self.verbose:
                 print("Processing rule {dfa_name}".format(dfa_name=name))
-            self.expect(self.tokens.OP, ":")
+            self.expect(tokenize.OP, ":")
             a, z = self.parse_rhs()
-            self.expect(self.tokens.NEWLINE)
+            self.expect(tokenize.NEWLINE)
             if self.verbose:
                 self.dump_nfa(name, a, z)
             dfa = self.make_dfa(a, z)
@@ -309,7 +293,7 @@ def parse_alt(self):
         # ALT: ITEM+
         a, b = self.parse_item()
         while (self.value in ("(", "[") or
-               self.type in (self.tokens.NAME, self.tokens.STRING)):
+               self.type in (tokenize.NAME, tokenize.STRING)):
             c, d = self.parse_item()
             b.addarc(c)
             b = d
@@ -320,7 +304,7 @@ def parse_item(self):
         if self.value == "[":
             self.gettoken()
             a, z = self.parse_rhs()
-            self.expect(self.tokens.OP, "]")
+            self.expect(tokenize.OP, "]")
             a.addarc(z)
             return a, z
         else:
@@ -340,9 +324,9 @@ def parse_atom(self):
         if self.value == "(":
             self.gettoken()
             a, z = self.parse_rhs()
-            self.expect(self.tokens.OP, ")")
+            self.expect(tokenize.OP, ")")
             return a, z
-        elif self.type in (self.tokens.NAME, self.tokens.STRING):
+        elif self.type in (tokenize.NAME, tokenize.STRING):
             a = NFAState()
             z = NFAState()
             a.addarc(z, self.value)
@@ -365,7 +349,7 @@ def gettoken(self):
         while tup[0] in (tokenize.COMMENT, tokenize.NL):
             tup = next(self.generator)
         self.type, self.value, self.begin, self.end, self.line = tup
-        #print self.tokens['tok_name'][self.type], repr(self.value)
+        # print(getattr(tokenize, 'tok_name')[self.type], repr(self.value))

     def raise_error(self, msg, *args):
         if args:
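
With the importlib machinery gone, ParserGenerator tokenizes the grammar with the interpreter's own tokenize module, as gettoken() shows above: pull 5-tuples from tokenize.generate_tokens() and skip COMMENT and NL tokens. A standalone sketch of that loop over a grammar-style line:

# Minimal sketch of how gettoken() consumes the stdlib tokenizer:
# generate_tokens() yields 5-tuples; COMMENT and NL tokens are skipped.
import io
import tokenize

source = io.StringIO("single_input: NEWLINE  # a grammar rule\n")
for tup in tokenize.generate_tokens(source.readline):
    if tup[0] in (tokenize.COMMENT, tokenize.NL):
        continue
    type_, value, begin, end, line = tup
    print(tokenize.tok_name[type_], repr(value))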

Parser/pgen/token.py

Lines changed: 40 additions & 0 deletions
@@ -0,0 +1,40 @@
+import itertools
+
+def generate_tokens(tokens):
+    numbers = itertools.count(0)
+    for line in tokens:
+        line = line.strip()
+
+        if not line:
+            continue
+        if line.strip().startswith('#'):
+            continue
+
+        name = line.split()[0]
+        yield (name, next(numbers))
+
+    yield ('N_TOKENS', next(numbers))
+    yield ('NT_OFFSET', 256)
+
+def generate_opmap(tokens):
+    for line in tokens:
+        line = line.strip()
+
+        if not line:
+            continue
+        if line.strip().startswith('#'):
+            continue
+
+        pieces = line.split()
+
+        if len(pieces) != 2:
+            continue
+
+        name, op = pieces
+        yield (op.strip("'"), name)
+
+    # Yield independently <>. This is needed so it does not collide
+    # with the token generation in "generate_tokens" because if this
+    # symbol is included in Grammar/Tokens, it will collide with !=
+    # as it has the same name (NOTEQUAL).
+    yield ('<>', 'NOTEQUAL')
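
The new helpers are plain generators over the lines of Grammar/Tokens, so they are easy to exercise in isolation. A small sketch against a made-up excerpt in the Grammar/Tokens format (a token name, optionally followed by the quoted operator); it assumes Parser.pgen is importable from the CPython source root:

# Sketch of generate_tokens()/generate_opmap() on a tiny, made-up excerpt.
from Parser.pgen import token

token_lines = [
    "ENDMARKER\n",
    "NAME\n",
    "NEWLINE\n",
    "LPAR                    '('\n",
    "# comment lines and blank lines are skipped\n",
]

tokens = dict(token.generate_tokens(token_lines))
# {'ENDMARKER': 0, 'NAME': 1, 'NEWLINE': 2, 'LPAR': 3,
#  'N_TOKENS': 4, 'NT_OFFSET': 256}

opmap = dict(token.generate_opmap(token_lines))
# {'(': 'LPAR', '<>': 'NOTEQUAL'}  -- '<>' is always yielded last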
