bpo-30455: Generate all token related code and docs from Grammar/Toke…

…ns. (pythonGH-10370) "Include/token.h", "Lib/token.py" (now containing some data moved from "Lib/tokenize.py") and new files "Parser/token.c" (containing the code moved from "Parser/tokenizer.c") and "Doc/library/token-list.inc" (included in "Doc/library/token.rst") are now generated from "Grammar/Tokens" by "Tools/scripts/generate_token.py". The script overwrites files only if needed and can be used on a read-only source tree. "Lib/symbol.py" is now generated by "Tools/scripts/generate_symbol_py.py" instead of being executable itself. Added new make targets "regen-token" and "regen-symbol", which are now dependencies of "regen-all". The documentation now contains strings for operators and punctuation tokens.
yushi-minemura · Dec 22, 2018 · 8ac6581 · 8ac6581
1 parent c1b4b0f
commit 8ac6581
Show file tree

Hide file tree

Showing 18 changed files with 940 additions and 462 deletions.
diff --git a/.gitattributes b/.gitattributes
@@ -55,3 +55,7 @@ Include/opcode.h            linguist-generated=true
 Python/opcode_targets.h     linguist-generated=true
 Objects/typeslots.inc       linguist-generated=true
 Modules/unicodedata_db.h    linguist-generated=true
+Doc/library/token-list.inc  linguist-generated=true
+Include/token.h             linguist-generated=true
+Lib/token.py                linguist-generated=true
+Parser/token.c              linguist-generated=true
diff --git a/Doc/library/token-list.inc b/Doc/library/token-list.inc
diff --git a/Doc/library/token.rst b/Doc/library/token.rst
@@ -44,64 +44,7 @@ functions.  The functions mirror definitions in the Python C header files.
 
 The token constants are:
 
-.. data:: ENDMARKER
-          NAME
-          NUMBER
-          STRING
-          NEWLINE
-          INDENT
-          DEDENT
-          LPAR
-          RPAR
-          LSQB
-          RSQB
-          COLON
-          COMMA
-          SEMI
-          PLUS
-          MINUS
-          STAR
-          SLASH
-          VBAR
-          AMPER
-          LESS
-          GREATER
-          EQUAL
-          DOT
-          PERCENT
-          LBRACE
-          RBRACE
-          EQEQUAL
-          NOTEQUAL
-          LESSEQUAL
-          GREATEREQUAL
-          TILDE
-          CIRCUMFLEX
-          LEFTSHIFT
-          RIGHTSHIFT
-          DOUBLESTAR
-          PLUSEQUAL
-          MINEQUAL
-          STAREQUAL
-          SLASHEQUAL
-          PERCENTEQUAL
-          AMPEREQUAL
-          VBAREQUAL
-          CIRCUMFLEXEQUAL
-          LEFTSHIFTEQUAL
-          RIGHTSHIFTEQUAL
-          DOUBLESTAREQUAL
-          DOUBLESLASH
-          DOUBLESLASHEQUAL
-          AT
-          ATEQUAL
-          RARROW
-          ELLIPSIS
-          OP
-          ERRORTOKEN
-          N_TOKENS
-          NT_OFFSET
-
+.. include:: token-list.inc
 
 The following token type values aren't used by the C tokenizer but are needed for
 the :mod:`tokenize` module.

diff --git a/Grammar/Tokens b/Grammar/Tokens
@@ -0,0 +1,62 @@
+ENDMARKER
+NAME
+NUMBER
+STRING
+NEWLINE
+INDENT
+DEDENT
+
+LPAR                    '('
+RPAR                    ')'
+LSQB                    '['
+RSQB                    ']'
+COLON                   ':'
+COMMA                   ','
+SEMI                    ';'
+PLUS                    '+'
+MINUS                   '-'
+STAR                    '*'
+SLASH                   '/'
+VBAR                    '|'
+AMPER                   '&'
+LESS                    '<'
+GREATER                 '>'
+EQUAL                   '='
+DOT                     '.'
+PERCENT                 '%'
+LBRACE                  '{'
+RBRACE                  '}'
+EQEQUAL                 '=='
+NOTEQUAL                '!='
+LESSEQUAL               '<='
+GREATEREQUAL            '>='
+TILDE                   '~'
+CIRCUMFLEX              '^'
+LEFTSHIFT               '<<'
+RIGHTSHIFT              '>>'
+DOUBLESTAR              '**'
+PLUSEQUAL               '+='
+MINEQUAL                '-='
+STAREQUAL               '*='
+SLASHEQUAL              '/='
+PERCENTEQUAL            '%='
+AMPEREQUAL              '&='
+VBAREQUAL               '|='
+CIRCUMFLEXEQUAL         '^='
+LEFTSHIFTEQUAL          '<<='
+RIGHTSHIFTEQUAL         '>>='
+DOUBLESTAREQUAL         '**='
+DOUBLESLASH             '//'
+DOUBLESLASHEQUAL        '//='
+AT                      '@'
+ATEQUAL                 '@='
+RARROW                  '->'
+ELLIPSIS                '...'
+
+OP
+ERRORTOKEN
+
+# These aren't used by the C tokenizer but are needed for tokenize.py
+COMMENT
+NL
+ENCODING
diff --git a/Include/token.h b/Include/token.h